root/net/ipv4/tcp_ipv4.c

DEFINITIONS

This source file includes the following definitions:
  1. tcp_v4_init_seq
  2. tcp_v4_init_ts_off
  3. tcp_twsk_unique
  4. tcp_v4_pre_connect
  5. tcp_v4_connect
  6. tcp_v4_mtu_reduced
  7. do_redirect
  8. tcp_req_err
  9. tcp_v4_err
  10. __tcp_v4_send_check
  11. tcp_v4_send_check
  12. tcp_v4_send_reset
  13. tcp_v4_send_ack
  14. tcp_v4_timewait_ack
  15. tcp_v4_reqsk_send_ack
  16. tcp_v4_send_synack
  17. tcp_v4_reqsk_destructor
  18. __tcp_md5_do_lookup
  19. tcp_md5_do_lookup_exact
  20. tcp_v4_md5_lookup
  21. tcp_md5_do_add
  22. tcp_md5_do_del
  23. tcp_clear_md5_list
  24. tcp_v4_parse_md5_keys
  25. tcp_v4_md5_hash_headers
  26. tcp_v4_md5_hash_hdr
  27. tcp_v4_md5_hash_skb
  28. tcp_v4_inbound_md5_hash
  29. tcp_v4_init_req
  30. tcp_v4_route_req
  31. tcp_v4_conn_request
  32. tcp_v4_syn_recv_sock
  33. tcp_v4_cookie_check
  34. tcp_v4_get_syncookie
  35. tcp_v4_do_rcv
  36. tcp_v4_early_demux
  37. tcp_add_backlog
  38. tcp_filter
  39. tcp_v4_restore_cb
  40. tcp_v4_fill_cb
  41. tcp_v4_rcv
  42. inet_sk_rx_dst_set
  43. tcp_v4_init_sock
  44. tcp_v4_destroy_sock
  45. listening_get_next
  46. listening_get_idx
  47. empty_bucket
  48. established_get_first
  49. established_get_next
  50. established_get_idx
  51. tcp_get_idx
  52. tcp_seek_last_pos
  53. tcp_seq_start
  54. tcp_seq_next
  55. tcp_seq_stop
  56. get_openreq4
  57. get_tcp4_sock
  58. get_timewait4_sock
  59. tcp4_seq_show
  60. tcp4_proc_init_net
  61. tcp4_proc_exit_net
  62. tcp4_proc_init
  63. tcp4_proc_exit
  64. tcp_sk_exit
  65. tcp_sk_init
  66. tcp_sk_exit_batch
  67. tcp_v4_init

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Implementation of the Transmission Control Protocol(TCP).
   8  *
   9  *              IPv4 specific functions
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  */
  18 
  19 /*
  20  * Changes:
  21  *              David S. Miller :       New socket lookup architecture.
  22  *                                      This code is dedicated to John Dyson.
  23  *              David S. Miller :       Change semantics of established hash,
  24  *                                      half is devoted to TIME_WAIT sockets
  25  *                                      and the rest go in the other half.
  26  *              Andi Kleen :            Add support for syncookies and fixed
  27  *                                      some bugs: ip options weren't passed to
  28  *                                      the TCP layer, missed a check for an
  29  *                                      ACK bit.
  30  *              Andi Kleen :            Implemented fast path mtu discovery.
  31  *                                      Fixed many serious bugs in the
  32  *                                      request_sock handling and moved
  33  *                                      most of it into the af independent code.
  34  *                                      Added tail drop and some other bugfixes.
  35  *                                      Added new listen semantics.
  36  *              Mike McLagan    :       Routing by source
  37  *      Juan Jose Ciarlante:            ip_dynaddr bits
  38  *              Andi Kleen:             various fixes.
  39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40  *                                      coma.
  41  *      Andi Kleen              :       Fix new listen.
  42  *      Andi Kleen              :       Fix accept error reporting.
  43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  45  *                                      a single port at the same time.
  46  */
  47 
  48 #define pr_fmt(fmt) "TCP: " fmt
  49 
  50 #include <linux/bottom_half.h>
  51 #include <linux/types.h>
  52 #include <linux/fcntl.h>
  53 #include <linux/module.h>
  54 #include <linux/random.h>
  55 #include <linux/cache.h>
  56 #include <linux/jhash.h>
  57 #include <linux/init.h>
  58 #include <linux/times.h>
  59 #include <linux/slab.h>
  60 
  61 #include <net/net_namespace.h>
  62 #include <net/icmp.h>
  63 #include <net/inet_hashtables.h>
  64 #include <net/tcp.h>
  65 #include <net/transp_v6.h>
  66 #include <net/ipv6.h>
  67 #include <net/inet_common.h>
  68 #include <net/timewait_sock.h>
  69 #include <net/xfrm.h>
  70 #include <net/secure_seq.h>
  71 #include <net/busy_poll.h>
  72 
  73 #include <linux/inet.h>
  74 #include <linux/ipv6.h>
  75 #include <linux/stddef.h>
  76 #include <linux/proc_fs.h>
  77 #include <linux/seq_file.h>
  78 #include <linux/inetdevice.h>
  79 
  80 #include <crypto/hash.h>
  81 #include <linux/scatterlist.h>
  82 
  83 #include <trace/events/tcp.h>
  84 
  85 #ifdef CONFIG_TCP_MD5SIG
  86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  87                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  88 #endif
  89 
  90 struct inet_hashinfo tcp_hashinfo;
  91 EXPORT_SYMBOL(tcp_hashinfo);
  92 
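      /* Initial sequence number and timestamp offset for an incoming SYN
       * (hooked up via tcp_request_sock_ipv4_ops below): both are derived
       * from the packet's addresses and ports, with the packet's daddr/dest
       * being the local side, mirroring the argument order used for an
       * outgoing connection in tcp_v4_connect().
       */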
  93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  94 {
  95         return secure_tcp_seq(ip_hdr(skb)->daddr,
  96                               ip_hdr(skb)->saddr,
  97                               tcp_hdr(skb)->dest,
  98                               tcp_hdr(skb)->source);
  99 }
 100 
 101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 102 {
 103         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 104 }
 105 
 106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 107 {
 108         const struct inet_timewait_sock *tw = inet_twsk(sktw);
 109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 110         struct tcp_sock *tp = tcp_sk(sk);
 111         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 112 
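              /* sysctl_tcp_tw_reuse: 0 disables reuse, 1 enables it, and 2
               * restricts it to loopback connections, which is what the
               * checks below enforce.
               */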
 113         if (reuse == 2) {
 114                 /* Still does not detect *everything* that goes through
 115                  * lo, since we require a loopback src or dst address
 116                  * or direct binding to 'lo' interface.
 117                  */
 118                 bool loopback = false;
 119                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 120                         loopback = true;
 121 #if IS_ENABLED(CONFIG_IPV6)
 122                 if (tw->tw_family == AF_INET6) {
 123                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 124                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 125                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 127                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 128                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 129                                 loopback = true;
 130                 } else
 131 #endif
 132                 {
 133                         if (ipv4_is_loopback(tw->tw_daddr) ||
 134                             ipv4_is_loopback(tw->tw_rcv_saddr))
 135                                 loopback = true;
 136                 }
 137                 if (!loopback)
 138                         reuse = 0;
 139         }
 140 
  141         /* With PAWS, it is safe from the viewpoint
  142            of data integrity. Even without PAWS it is safe provided sequence
  143            spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
  144 
  145            Actually, the idea is close to VJ's: the timestamp cache is
  146            held not per host but per port pair, and the TW bucket is used
  147            as the state holder.
  148 
  149            If the TW bucket has already been destroyed we fall back to VJ's
  150            scheme and use the initial timestamp retrieved from the peer table.
  151          */
 152         if (tcptw->tw_ts_recent_stamp &&
 153             (!twp || (reuse && time_after32(ktime_get_seconds(),
 154                                             tcptw->tw_ts_recent_stamp)))) {
 155                 /* In case of repair and re-using TIME-WAIT sockets we still
 156                  * want to be sure that it is safe as above but honor the
 157                  * sequence numbers and time stamps set as part of the repair
 158                  * process.
 159                  *
 160                  * Without this check re-using a TIME-WAIT socket with TCP
 161                  * repair would accumulate a -1 on the repair assigned
 162                  * sequence number. The first time it is reused the sequence
 163                  * is -1, the second time -2, etc. This fixes that issue
 164                  * without appearing to create any others.
 165                  */
 166                 if (likely(!tp->repair)) {
 167                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 168 
 169                         if (!seq)
 170                                 seq = 1;
 171                         WRITE_ONCE(tp->write_seq, seq);
 172                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 173                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 174                 }
 175                 sock_hold(sktw);
 176                 return 1;
 177         }
 178 
 179         return 0;
 180 }
 181 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 182 
 183 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 184                               int addr_len)
 185 {
  186         /* This check is replicated from tcp_v4_connect() and intended to
  187          * prevent the BPF program called below from accessing bytes that are
  188          * out of the bound specified by the user in addr_len.
  189          */
 190         if (addr_len < sizeof(struct sockaddr_in))
 191                 return -EINVAL;
 192 
 193         sock_owned_by_me(sk);
 194 
 195         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 196 }
 197 
 198 /* This will initiate an outgoing connection. */
 199 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 200 {
 201         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 202         struct inet_sock *inet = inet_sk(sk);
 203         struct tcp_sock *tp = tcp_sk(sk);
 204         __be16 orig_sport, orig_dport;
 205         __be32 daddr, nexthop;
 206         struct flowi4 *fl4;
 207         struct rtable *rt;
 208         int err;
 209         struct ip_options_rcu *inet_opt;
 210         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 211 
 212         if (addr_len < sizeof(struct sockaddr_in))
 213                 return -EINVAL;
 214 
 215         if (usin->sin_family != AF_INET)
 216                 return -EAFNOSUPPORT;
 217 
 218         nexthop = daddr = usin->sin_addr.s_addr;
 219         inet_opt = rcu_dereference_protected(inet->inet_opt,
 220                                              lockdep_sock_is_held(sk));
 221         if (inet_opt && inet_opt->opt.srr) {
 222                 if (!daddr)
 223                         return -EINVAL;
 224                 nexthop = inet_opt->opt.faddr;
 225         }
 226 
 227         orig_sport = inet->inet_sport;
 228         orig_dport = usin->sin_port;
 229         fl4 = &inet->cork.fl.u.ip4;
 230         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 231                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 232                               IPPROTO_TCP,
 233                               orig_sport, orig_dport, sk);
 234         if (IS_ERR(rt)) {
 235                 err = PTR_ERR(rt);
 236                 if (err == -ENETUNREACH)
 237                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 238                 return err;
 239         }
 240 
 241         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 242                 ip_rt_put(rt);
 243                 return -ENETUNREACH;
 244         }
 245 
 246         if (!inet_opt || !inet_opt->opt.srr)
 247                 daddr = fl4->daddr;
 248 
 249         if (!inet->inet_saddr)
 250                 inet->inet_saddr = fl4->saddr;
 251         sk_rcv_saddr_set(sk, inet->inet_saddr);
 252 
 253         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 254                 /* Reset inherited state */
 255                 tp->rx_opt.ts_recent       = 0;
 256                 tp->rx_opt.ts_recent_stamp = 0;
 257                 if (likely(!tp->repair))
 258                         WRITE_ONCE(tp->write_seq, 0);
 259         }
 260 
 261         inet->inet_dport = usin->sin_port;
 262         sk_daddr_set(sk, daddr);
 263 
 264         inet_csk(sk)->icsk_ext_hdr_len = 0;
 265         if (inet_opt)
 266                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 267 
 268         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 269 
  270         /* Socket identity is still unknown (sport may be zero).
  271          * However, we set the state to SYN-SENT and, without releasing the
  272          * socket lock, select a source port, enter ourselves into the hash
  273          * tables and complete initialization after this.
  274          */
 275         tcp_set_state(sk, TCP_SYN_SENT);
 276         err = inet_hash_connect(tcp_death_row, sk);
 277         if (err)
 278                 goto failure;
 279 
 280         sk_set_txhash(sk);
 281 
 282         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 283                                inet->inet_sport, inet->inet_dport, sk);
 284         if (IS_ERR(rt)) {
 285                 err = PTR_ERR(rt);
 286                 rt = NULL;
 287                 goto failure;
 288         }
 289         /* OK, now commit destination to socket.  */
 290         sk->sk_gso_type = SKB_GSO_TCPV4;
 291         sk_setup_caps(sk, &rt->dst);
 292         rt = NULL;
 293 
 294         if (likely(!tp->repair)) {
 295                 if (!tp->write_seq)
 296                         WRITE_ONCE(tp->write_seq,
 297                                    secure_tcp_seq(inet->inet_saddr,
 298                                                   inet->inet_daddr,
 299                                                   inet->inet_sport,
 300                                                   usin->sin_port));
 301                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 302                                                  inet->inet_saddr,
 303                                                  inet->inet_daddr);
 304         }
 305 
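              /* Seed this connection's IP ID counter with a random value. */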
 306         inet->inet_id = prandom_u32();
 307 
 308         if (tcp_fastopen_defer_connect(sk, &err))
 309                 return err;
 310         if (err)
 311                 goto failure;
 312 
 313         err = tcp_connect(sk);
 314 
 315         if (err)
 316                 goto failure;
 317 
 318         return 0;
 319 
 320 failure:
 321         /*
 322          * This unhashes the socket and releases the local port,
 323          * if necessary.
 324          */
 325         tcp_set_state(sk, TCP_CLOSE);
 326         ip_rt_put(rt);
 327         sk->sk_route_caps = 0;
 328         inet->inet_dport = 0;
 329         return err;
 330 }
 331 EXPORT_SYMBOL(tcp_v4_connect);
 332 
  333 /*
  334  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
  335  * It can be called through tcp_release_cb() if the socket was owned by the user
  336  * at the time tcp_v4_err() was called to handle the ICMP message.
  337  */
 338 void tcp_v4_mtu_reduced(struct sock *sk)
 339 {
 340         struct inet_sock *inet = inet_sk(sk);
 341         struct dst_entry *dst;
 342         u32 mtu;
 343 
 344         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 345                 return;
 346         mtu = tcp_sk(sk)->mtu_info;
 347         dst = inet_csk_update_pmtu(sk, mtu);
 348         if (!dst)
 349                 return;
 350 
  351         /* Something is about to go wrong... Remember the soft error
  352          * in case this connection is not able to recover.
  353          */
 354         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 355                 sk->sk_err_soft = EMSGSIZE;
 356 
 357         mtu = dst_mtu(dst);
 358 
 359         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 360             ip_sk_accept_pmtu(sk) &&
 361             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 362                 tcp_sync_mss(sk, mtu);
 363 
 364                 /* Resend the TCP packet because it's
 365                  * clear that the old packet has been
 366                  * dropped. This is the new "fast" path mtu
 367                  * discovery.
 368                  */
 369                 tcp_simple_retransmit(sk);
 370         } /* else let the usual retransmit timer handle it */
 371 }
 372 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 373 
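      /* Handle an ICMP redirect: if the socket still has a valid cached
       * route, let the routing layer update it for the new next hop.
       */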
 374 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 375 {
 376         struct dst_entry *dst = __sk_dst_check(sk, 0);
 377 
 378         if (dst)
 379                 dst->ops->redirect(dst, sk, skb);
 380 }
 381 
 382 
 383 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 384 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 385 {
 386         struct request_sock *req = inet_reqsk(sk);
 387         struct net *net = sock_net(sk);
 388 
 389         /* ICMPs are not backlogged, hence we cannot get
 390          * an established socket here.
 391          */
 392         if (seq != tcp_rsk(req)->snt_isn) {
 393                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 394         } else if (abort) {
 395                 /*
 396                  * Still in SYN_RECV, just remove it silently.
 397                  * There is no good way to pass the error to the newly
 398                  * created socket, and POSIX does not want network
 399                  * errors returned from accept().
 400                  */
 401                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 402                 tcp_listendrop(req->rsk_listener);
 403         }
 404         reqsk_put(req);
 405 }
 406 EXPORT_SYMBOL(tcp_req_err);
 407 
 408 /*
 409  * This routine is called by the ICMP module when it gets some
 410  * sort of error condition.  If err < 0 then the socket should
 411  * be closed and the error returned to the user.  If err > 0
  412  * it's just the icmp type << 8 | icmp code.  After adjustment the
 413  * header points to the first 8 bytes of the tcp header.  We need
 414  * to find the appropriate port.
 415  *
 416  * The locking strategy used here is very "optimistic". When
 417  * someone else accesses the socket the ICMP is just dropped
 418  * and for some paths there is no check at all.
 419  * A more general error queue to queue errors for later handling
 420  * is probably better.
 421  *
 422  */
 423 
 424 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 425 {
 426         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 427         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 428         struct inet_connection_sock *icsk;
 429         struct tcp_sock *tp;
 430         struct inet_sock *inet;
 431         const int type = icmp_hdr(icmp_skb)->type;
 432         const int code = icmp_hdr(icmp_skb)->code;
 433         struct sock *sk;
 434         struct sk_buff *skb;
 435         struct request_sock *fastopen;
 436         u32 seq, snd_una;
 437         s32 remaining;
 438         u32 delta_us;
 439         int err;
 440         struct net *net = dev_net(icmp_skb->dev);
 441 
 442         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 443                                        th->dest, iph->saddr, ntohs(th->source),
 444                                        inet_iif(icmp_skb), 0);
 445         if (!sk) {
 446                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 447                 return -ENOENT;
 448         }
 449         if (sk->sk_state == TCP_TIME_WAIT) {
 450                 inet_twsk_put(inet_twsk(sk));
 451                 return 0;
 452         }
 453         seq = ntohl(th->seq);
 454         if (sk->sk_state == TCP_NEW_SYN_RECV) {
 455                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 456                                      type == ICMP_TIME_EXCEEDED ||
 457                                      (type == ICMP_DEST_UNREACH &&
 458                                       (code == ICMP_NET_UNREACH ||
 459                                        code == ICMP_HOST_UNREACH)));
 460                 return 0;
 461         }
 462 
 463         bh_lock_sock(sk);
  464         /* If too many ICMPs get dropped on busy
  465          * servers this needs to be solved differently.
  466          * We do take care of the PMTU discovery (RFC1191) special case:
  467          * we can receive locally generated ICMP messages while the socket is held.
  468          */
 469         if (sock_owned_by_user(sk)) {
 470                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 471                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 472         }
 473         if (sk->sk_state == TCP_CLOSE)
 474                 goto out;
 475 
 476         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 477                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 478                 goto out;
 479         }
 480 
 481         icsk = inet_csk(sk);
 482         tp = tcp_sk(sk);
 483         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 484         fastopen = rcu_dereference(tp->fastopen_rsk);
 485         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 486         if (sk->sk_state != TCP_LISTEN &&
 487             !between(seq, snd_una, tp->snd_nxt)) {
 488                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 489                 goto out;
 490         }
 491 
 492         switch (type) {
 493         case ICMP_REDIRECT:
 494                 if (!sock_owned_by_user(sk))
 495                         do_redirect(icmp_skb, sk);
 496                 goto out;
 497         case ICMP_SOURCE_QUENCH:
 498                 /* Just silently ignore these. */
 499                 goto out;
 500         case ICMP_PARAMETERPROB:
 501                 err = EPROTO;
 502                 break;
 503         case ICMP_DEST_UNREACH:
 504                 if (code > NR_ICMP_UNREACH)
 505                         goto out;
 506 
 507                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
  508                         /* We are not interested in TCP_LISTEN and open_requests
  509                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
  510                          * they should go through unfragmented).
  511                          */
 512                         if (sk->sk_state == TCP_LISTEN)
 513                                 goto out;
 514 
 515                         tp->mtu_info = info;
 516                         if (!sock_owned_by_user(sk)) {
 517                                 tcp_v4_mtu_reduced(sk);
 518                         } else {
 519                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 520                                         sock_hold(sk);
 521                         }
 522                         goto out;
 523                 }
 524 
 525                 err = icmp_err_convert[code].errno;
 526                 /* check if icmp_skb allows revert of backoff
 527                  * (see draft-zimmermann-tcp-lcd) */
 528                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 529                         break;
 530                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 531                     !icsk->icsk_backoff || fastopen)
 532                         break;
 533 
 534                 if (sock_owned_by_user(sk))
 535                         break;
 536 
 537                 skb = tcp_rtx_queue_head(sk);
 538                 if (WARN_ON_ONCE(!skb))
 539                         break;
 540 
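                      /* Undo one step of exponential backoff, recompute the RTO,
                       * and either re-arm the retransmit timer with the time that
                       * remains or, if it has already elapsed, retransmit now.
                       */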
 541                 icsk->icsk_backoff--;
 542                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 543                                                TCP_TIMEOUT_INIT;
 544                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 545 
 546 
 547                 tcp_mstamp_refresh(tp);
 548                 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 549                 remaining = icsk->icsk_rto -
 550                             usecs_to_jiffies(delta_us);
 551 
 552                 if (remaining > 0) {
 553                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 554                                                   remaining, TCP_RTO_MAX);
 555                 } else {
 556                         /* RTO revert clocked out retransmission.
 557                          * Will retransmit now */
 558                         tcp_retransmit_timer(sk);
 559                 }
 560 
 561                 break;
 562         case ICMP_TIME_EXCEEDED:
 563                 err = EHOSTUNREACH;
 564                 break;
 565         default:
 566                 goto out;
 567         }
 568 
 569         switch (sk->sk_state) {
 570         case TCP_SYN_SENT:
 571         case TCP_SYN_RECV:
  572                 /* Only in fast or simultaneous open. If a fast open socket
 573                  * is already accepted it is treated as a connected one below.
 574                  */
 575                 if (fastopen && !fastopen->sk)
 576                         break;
 577 
 578                 if (!sock_owned_by_user(sk)) {
 579                         sk->sk_err = err;
 580 
 581                         sk->sk_error_report(sk);
 582 
 583                         tcp_done(sk);
 584                 } else {
 585                         sk->sk_err_soft = err;
 586                 }
 587                 goto out;
 588         }
 589 
 590         /* If we've already connected we will keep trying
 591          * until we time out, or the user gives up.
 592          *
  593          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
  594          * to be considered hard errors (well, FRAG_FAILED too,
  595          * but it is obsoleted by pmtu discovery).
  596          *
  597          * Note that in the modern internet, where routing is unreliable
  598          * and broken firewalls sit in every dark corner sending random
  599          * errors ordered by their masters, even these two messages finally
  600          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
 601          *
 602          * Now we are in compliance with RFCs.
 603          *                                                      --ANK (980905)
 604          */
 605 
 606         inet = inet_sk(sk);
 607         if (!sock_owned_by_user(sk) && inet->recverr) {
 608                 sk->sk_err = err;
 609                 sk->sk_error_report(sk);
 610         } else  { /* Only an error on timeout */
 611                 sk->sk_err_soft = err;
 612         }
 613 
 614 out:
 615         bh_unlock_sock(sk);
 616         sock_put(sk);
 617         return 0;
 618 }
 619 
 620 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 621 {
 622         struct tcphdr *th = tcp_hdr(skb);
 623 
 624         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 625         skb->csum_start = skb_transport_header(skb) - skb->head;
 626         skb->csum_offset = offsetof(struct tcphdr, check);
 627 }
 628 
 629 /* This routine computes an IPv4 TCP checksum. */
 630 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 631 {
 632         const struct inet_sock *inet = inet_sk(sk);
 633 
 634         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 635 }
 636 EXPORT_SYMBOL(tcp_v4_send_check);
 637 
 638 /*
 639  *      This routine will send an RST to the other tcp.
 640  *
  641  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  642  *                    for the reset?
  643  *      Answer: if a packet caused a RST, it is not for a socket
  644  *              existing in our system; if it is matched to a socket,
  645  *              it is just a duplicate segment or a bug in the other side's TCP.
  646  *              So we build the reply based only on the parameters that
  647  *              arrived with the segment.
 648  *      Exception: precedence violation. We do not implement it in any case.
 649  */
 650 
 651 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 652 {
 653         const struct tcphdr *th = tcp_hdr(skb);
 654         struct {
 655                 struct tcphdr th;
 656 #ifdef CONFIG_TCP_MD5SIG
 657                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 658 #endif
 659         } rep;
 660         struct ip_reply_arg arg;
 661 #ifdef CONFIG_TCP_MD5SIG
 662         struct tcp_md5sig_key *key = NULL;
 663         const __u8 *hash_location = NULL;
 664         unsigned char newhash[16];
 665         int genhash;
 666         struct sock *sk1 = NULL;
 667 #endif
 668         u64 transmit_time = 0;
 669         struct sock *ctl_sk;
 670         struct net *net;
 671 
 672         /* Never send a reset in response to a reset. */
 673         if (th->rst)
 674                 return;
 675 
  676         /* If sk is not NULL, it means we did a successful lookup and the
  677          * incoming route had to be correct. prequeue might have dropped our dst.
  678          */
 679         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 680                 return;
 681 
 682         /* Swap the send and the receive. */
 683         memset(&rep, 0, sizeof(rep));
 684         rep.th.dest   = th->source;
 685         rep.th.source = th->dest;
 686         rep.th.doff   = sizeof(struct tcphdr) / 4;
 687         rep.th.rst    = 1;
 688 
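              /* Per RFC 793: if the incoming segment carried an ACK, the RST
               * takes its sequence number from that ACK and needs no ACK of
               * its own; otherwise seq stays 0 (rep was zeroed above) and we
               * ACK everything the offending segment occupied in sequence space.
               */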
 689         if (th->ack) {
 690                 rep.th.seq = th->ack_seq;
 691         } else {
 692                 rep.th.ack = 1;
 693                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 694                                        skb->len - (th->doff << 2));
 695         }
 696 
 697         memset(&arg, 0, sizeof(arg));
 698         arg.iov[0].iov_base = (unsigned char *)&rep;
 699         arg.iov[0].iov_len  = sizeof(rep.th);
 700 
 701         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 702 #ifdef CONFIG_TCP_MD5SIG
 703         rcu_read_lock();
 704         hash_location = tcp_parse_md5sig_option(th);
 705         if (sk && sk_fullsock(sk)) {
 706                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 707                                         &ip_hdr(skb)->saddr, AF_INET);
 708         } else if (hash_location) {
  709                 /*
  710                  * The active side is lost. Try to find the listening socket
  711                  * through the source port, and then find the md5 key through
  712                  * the listening socket. We do not lose any security here:
  713                  * the incoming packet is checked against the md5 hash of the
  714                  * key we find, and no RST is generated if the hash doesn't match.
  715                  */
 716                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 717                                              ip_hdr(skb)->saddr,
 718                                              th->source, ip_hdr(skb)->daddr,
 719                                              ntohs(th->source), inet_iif(skb),
 720                                              tcp_v4_sdif(skb));
 721                 /* don't send rst if it can't find key */
 722                 if (!sk1)
 723                         goto out;
 724 
 725                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 726                                         &ip_hdr(skb)->saddr, AF_INET);
 727                 if (!key)
 728                         goto out;
 729 
 730 
 731                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 732                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 733                         goto out;
 734 
 735         }
 736 
 737         if (key) {
 738                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 739                                    (TCPOPT_NOP << 16) |
 740                                    (TCPOPT_MD5SIG << 8) |
 741                                    TCPOLEN_MD5SIG);
 742                 /* Update length and the length the header thinks exists */
 743                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 744                 rep.th.doff = arg.iov[0].iov_len / 4;
 745 
 746                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 747                                      key, ip_hdr(skb)->saddr,
 748                                      ip_hdr(skb)->daddr, &rep.th);
 749         }
 750 #endif
 751         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 752                                       ip_hdr(skb)->saddr, /* XXX */
 753                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 754         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 755         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 756 
  757         /* When the socket is gone, all binding information is lost and
  758          * routing might fail in this case. No choice here: if we choose to force
  759          * the input interface, we will misroute in case of an asymmetric route.
  760          */
 761         if (sk) {
 762                 arg.bound_dev_if = sk->sk_bound_dev_if;
 763                 if (sk_fullsock(sk))
 764                         trace_tcp_send_reset(sk, skb);
 765         }
 766 
 767         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 768                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 769 
 770         arg.tos = ip_hdr(skb)->tos;
 771         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 772         local_bh_disable();
 773         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 774         if (sk) {
 775                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 776                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
 777                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 778                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
 779                 transmit_time = tcp_transmit_time(sk);
 780         }
 781         ip_send_unicast_reply(ctl_sk,
 782                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 783                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 784                               &arg, arg.iov[0].iov_len,
 785                               transmit_time);
 786 
 787         ctl_sk->sk_mark = 0;
 788         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 789         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 790         local_bh_enable();
 791 
 792 #ifdef CONFIG_TCP_MD5SIG
 793 out:
 794         rcu_read_unlock();
 795 #endif
 796 }
 797 
  798 /* The code below, which sends ACKs in the SYN-RECV and TIME-WAIT states
  799    outside of socket context, is certainly ugly. What can I do?
  800  */
 801 
 802 static void tcp_v4_send_ack(const struct sock *sk,
 803                             struct sk_buff *skb, u32 seq, u32 ack,
 804                             u32 win, u32 tsval, u32 tsecr, int oif,
 805                             struct tcp_md5sig_key *key,
 806                             int reply_flags, u8 tos)
 807 {
 808         const struct tcphdr *th = tcp_hdr(skb);
 809         struct {
 810                 struct tcphdr th;
 811                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 812 #ifdef CONFIG_TCP_MD5SIG
 813                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 814 #endif
 815                         ];
 816         } rep;
 817         struct net *net = sock_net(sk);
 818         struct ip_reply_arg arg;
 819         struct sock *ctl_sk;
 820         u64 transmit_time;
 821 
 822         memset(&rep.th, 0, sizeof(struct tcphdr));
 823         memset(&arg, 0, sizeof(arg));
 824 
 825         arg.iov[0].iov_base = (unsigned char *)&rep;
 826         arg.iov[0].iov_len  = sizeof(rep.th);
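              /* Build the TCP options by hand: NOP,NOP,TIMESTAMP when echoing
               * a timestamp, optionally followed by NOP,NOP,MD5SIG below.
               */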
 827         if (tsecr) {
 828                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 829                                    (TCPOPT_TIMESTAMP << 8) |
 830                                    TCPOLEN_TIMESTAMP);
 831                 rep.opt[1] = htonl(tsval);
 832                 rep.opt[2] = htonl(tsecr);
 833                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 834         }
 835 
 836         /* Swap the send and the receive. */
 837         rep.th.dest    = th->source;
 838         rep.th.source  = th->dest;
 839         rep.th.doff    = arg.iov[0].iov_len / 4;
 840         rep.th.seq     = htonl(seq);
 841         rep.th.ack_seq = htonl(ack);
 842         rep.th.ack     = 1;
 843         rep.th.window  = htons(win);
 844 
 845 #ifdef CONFIG_TCP_MD5SIG
 846         if (key) {
 847                 int offset = (tsecr) ? 3 : 0;
 848 
 849                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 850                                           (TCPOPT_NOP << 16) |
 851                                           (TCPOPT_MD5SIG << 8) |
 852                                           TCPOLEN_MD5SIG);
 853                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 854                 rep.th.doff = arg.iov[0].iov_len/4;
 855 
 856                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 857                                     key, ip_hdr(skb)->saddr,
 858                                     ip_hdr(skb)->daddr, &rep.th);
 859         }
 860 #endif
 861         arg.flags = reply_flags;
 862         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 863                                       ip_hdr(skb)->saddr, /* XXX */
 864                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 865         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 866         if (oif)
 867                 arg.bound_dev_if = oif;
 868         arg.tos = tos;
 869         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 870         local_bh_disable();
 871         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 872         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 873                            inet_twsk(sk)->tw_mark : sk->sk_mark;
 874         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 875                            inet_twsk(sk)->tw_priority : sk->sk_priority;
 876         transmit_time = tcp_transmit_time(sk);
 877         ip_send_unicast_reply(ctl_sk,
 878                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 879                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 880                               &arg, arg.iov[0].iov_len,
 881                               transmit_time);
 882 
 883         ctl_sk->sk_mark = 0;
 884         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 885         local_bh_enable();
 886 }
 887 
 888 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 889 {
 890         struct inet_timewait_sock *tw = inet_twsk(sk);
 891         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 892 
 893         tcp_v4_send_ack(sk, skb,
 894                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 895                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 896                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 897                         tcptw->tw_ts_recent,
 898                         tw->tw_bound_dev_if,
 899                         tcp_twsk_md5_key(tcptw),
 900                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 901                         tw->tw_tos
 902                         );
 903 
 904         inet_twsk_put(tw);
 905 }
 906 
 907 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 908                                   struct request_sock *req)
 909 {
 910         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 911          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 912          */
 913         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 914                                              tcp_sk(sk)->snd_nxt;
 915 
 916         /* RFC 7323 2.3
 917          * The window field (SEG.WND) of every outgoing segment, with the
 918          * exception of <SYN> segments, MUST be right-shifted by
 919          * Rcv.Wind.Shift bits:
 920          */
 921         tcp_v4_send_ack(sk, skb, seq,
 922                         tcp_rsk(req)->rcv_nxt,
 923                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 924                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 925                         req->ts_recent,
 926                         0,
 927                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 928                                           AF_INET),
 929                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 930                         ip_hdr(skb)->tos);
 931 }
 932 
 933 /*
 934  *      Send a SYN-ACK after having received a SYN.
 935  *      This still operates on a request_sock only, not on a big
 936  *      socket.
 937  */
 938 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 939                               struct flowi *fl,
 940                               struct request_sock *req,
 941                               struct tcp_fastopen_cookie *foc,
 942                               enum tcp_synack_type synack_type)
 943 {
 944         const struct inet_request_sock *ireq = inet_rsk(req);
 945         struct flowi4 fl4;
 946         int err = -1;
 947         struct sk_buff *skb;
 948 
 949         /* First, grab a route. */
 950         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 951                 return -1;
 952 
 953         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 954 
 955         if (skb) {
 956                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 957 
 958                 rcu_read_lock();
 959                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 960                                             ireq->ir_rmt_addr,
 961                                             rcu_dereference(ireq->ireq_opt));
 962                 rcu_read_unlock();
 963                 err = net_xmit_eval(err);
 964         }
 965 
 966         return err;
 967 }
 968 
 969 /*
 970  *      IPv4 request_sock destructor.
 971  */
 972 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 973 {
 974         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 975 }
 976 
 977 #ifdef CONFIG_TCP_MD5SIG
 978 /*
 979  * RFC2385 MD5 checksumming requires a mapping of
 980  * IP address->MD5 Key.
 981  * We need to maintain these in the sk structure.
 982  */
 983 
 984 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
 985 EXPORT_SYMBOL(tcp_md5_needed);
 986 
 987 /* Find the Key structure for an address.  */
 988 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
 989                                            const union tcp_md5_addr *addr,
 990                                            int family)
 991 {
 992         const struct tcp_sock *tp = tcp_sk(sk);
 993         struct tcp_md5sig_key *key;
 994         const struct tcp_md5sig_info *md5sig;
 995         __be32 mask;
 996         struct tcp_md5sig_key *best_match = NULL;
 997         bool match;
 998 
 999         /* caller either holds rcu_read_lock() or socket lock */
1000         md5sig = rcu_dereference_check(tp->md5sig_info,
1001                                        lockdep_sock_is_held(sk));
1002         if (!md5sig)
1003                 return NULL;
1004 
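              /* Longest-prefix match: among all keys whose prefix covers
               * @addr, pick the one with the largest prefixlen.
               */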
1005         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1006                 if (key->family != family)
1007                         continue;
1008 
1009                 if (family == AF_INET) {
1010                         mask = inet_make_mask(key->prefixlen);
1011                         match = (key->addr.a4.s_addr & mask) ==
1012                                 (addr->a4.s_addr & mask);
1013 #if IS_ENABLED(CONFIG_IPV6)
1014                 } else if (family == AF_INET6) {
1015                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1016                                                   key->prefixlen);
1017 #endif
1018                 } else {
1019                         match = false;
1020                 }
1021 
1022                 if (match && (!best_match ||
1023                               key->prefixlen > best_match->prefixlen))
1024                         best_match = key;
1025         }
1026         return best_match;
1027 }
1028 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1029 
1030 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1031                                                       const union tcp_md5_addr *addr,
1032                                                       int family, u8 prefixlen)
1033 {
1034         const struct tcp_sock *tp = tcp_sk(sk);
1035         struct tcp_md5sig_key *key;
1036         unsigned int size = sizeof(struct in_addr);
1037         const struct tcp_md5sig_info *md5sig;
1038 
1039         /* caller either holds rcu_read_lock() or socket lock */
1040         md5sig = rcu_dereference_check(tp->md5sig_info,
1041                                        lockdep_sock_is_held(sk));
1042         if (!md5sig)
1043                 return NULL;
1044 #if IS_ENABLED(CONFIG_IPV6)
1045         if (family == AF_INET6)
1046                 size = sizeof(struct in6_addr);
1047 #endif
1048         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1049                 if (key->family != family)
1050                         continue;
1051                 if (!memcmp(&key->addr, addr, size) &&
1052                     key->prefixlen == prefixlen)
1053                         return key;
1054         }
1055         return NULL;
1056 }
1057 
1058 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1059                                          const struct sock *addr_sk)
1060 {
1061         const union tcp_md5_addr *addr;
1062 
1063         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1064         return tcp_md5_do_lookup(sk, addr, AF_INET);
1065 }
1066 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1067 
1068 /* This can be called on a newly created socket, from other files */
1069 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1070                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1071                    gfp_t gfp)
1072 {
1073         /* Add Key to the list */
1074         struct tcp_md5sig_key *key;
1075         struct tcp_sock *tp = tcp_sk(sk);
1076         struct tcp_md5sig_info *md5sig;
1077 
1078         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1079         if (key) {
1080                 /* Pre-existing entry - just update that one. */
1081                 memcpy(key->key, newkey, newkeylen);
1082                 key->keylen = newkeylen;
1083                 return 0;
1084         }
1085 
1086         md5sig = rcu_dereference_protected(tp->md5sig_info,
1087                                            lockdep_sock_is_held(sk));
1088         if (!md5sig) {
1089                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1090                 if (!md5sig)
1091                         return -ENOMEM;
1092 
1093                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1094                 INIT_HLIST_HEAD(&md5sig->head);
1095                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1096         }
1097 
1098         key = sock_kmalloc(sk, sizeof(*key), gfp);
1099         if (!key)
1100                 return -ENOMEM;
1101         if (!tcp_alloc_md5sig_pool()) {
1102                 sock_kfree_s(sk, key, sizeof(*key));
1103                 return -ENOMEM;
1104         }
1105 
1106         memcpy(key->key, newkey, newkeylen);
1107         key->keylen = newkeylen;
1108         key->family = family;
1109         key->prefixlen = prefixlen;
1110         memcpy(&key->addr, addr,
1111                (family == AF_INET6) ? sizeof(struct in6_addr) :
1112                                       sizeof(struct in_addr));
1113         hlist_add_head_rcu(&key->node, &md5sig->head);
1114         return 0;
1115 }
1116 EXPORT_SYMBOL(tcp_md5_do_add);
1117 
1118 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1119                    u8 prefixlen)
1120 {
1121         struct tcp_md5sig_key *key;
1122 
1123         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1124         if (!key)
1125                 return -ENOENT;
1126         hlist_del_rcu(&key->node);
1127         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1128         kfree_rcu(key, rcu);
1129         return 0;
1130 }
1131 EXPORT_SYMBOL(tcp_md5_do_del);
1132 
1133 static void tcp_clear_md5_list(struct sock *sk)
1134 {
1135         struct tcp_sock *tp = tcp_sk(sk);
1136         struct tcp_md5sig_key *key;
1137         struct hlist_node *n;
1138         struct tcp_md5sig_info *md5sig;
1139 
1140         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1141 
1142         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1143                 hlist_del_rcu(&key->node);
1144                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1145                 kfree_rcu(key, rcu);
1146         }
1147 }
1148 
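      /* setsockopt(TCP_MD5SIG/TCP_MD5SIG_EXT) handler: a zero key length
       * deletes the key for the given address/prefix, otherwise the key
       * is added or replaced.
       */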
1149 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1150                                  char __user *optval, int optlen)
1151 {
1152         struct tcp_md5sig cmd;
1153         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1154         u8 prefixlen = 32;
1155 
1156         if (optlen < sizeof(cmd))
1157                 return -EINVAL;
1158 
1159         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1160                 return -EFAULT;
1161 
1162         if (sin->sin_family != AF_INET)
1163                 return -EINVAL;
1164 
1165         if (optname == TCP_MD5SIG_EXT &&
1166             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1167                 prefixlen = cmd.tcpm_prefixlen;
1168                 if (prefixlen > 32)
1169                         return -EINVAL;
1170         }
1171 
1172         if (!cmd.tcpm_keylen)
1173                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1174                                       AF_INET, prefixlen);
1175 
1176         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1177                 return -EINVAL;
1178 
1179         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1180                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1181                               GFP_KERNEL);
1182 }
1183 
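      /* Feed the IPv4 pseudo-header plus the fixed TCP header (with its
       * checksum field zeroed) into the MD5 request; callers then hash the
       * payload, if any, and finally the key, per RFC 2385.
       */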
1184 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1185                                    __be32 daddr, __be32 saddr,
1186                                    const struct tcphdr *th, int nbytes)
1187 {
1188         struct tcp4_pseudohdr *bp;
1189         struct scatterlist sg;
1190         struct tcphdr *_th;
1191 
1192         bp = hp->scratch;
1193         bp->saddr = saddr;
1194         bp->daddr = daddr;
1195         bp->pad = 0;
1196         bp->protocol = IPPROTO_TCP;
1197         bp->len = cpu_to_be16(nbytes);
1198 
1199         _th = (struct tcphdr *)(bp + 1);
1200         memcpy(_th, th, sizeof(*th));
1201         _th->check = 0;
1202 
1203         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1204         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1205                                 sizeof(*bp) + sizeof(*th));
1206         return crypto_ahash_update(hp->md5_req);
1207 }
1208 
1209 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1210                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1211 {
1212         struct tcp_md5sig_pool *hp;
1213         struct ahash_request *req;
1214 
1215         hp = tcp_get_md5sig_pool();
1216         if (!hp)
1217                 goto clear_hash_noput;
1218         req = hp->md5_req;
1219 
1220         if (crypto_ahash_init(req))
1221                 goto clear_hash;
1222         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1223                 goto clear_hash;
1224         if (tcp_md5_hash_key(hp, key))
1225                 goto clear_hash;
1226         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1227         if (crypto_ahash_final(req))
1228                 goto clear_hash;
1229 
1230         tcp_put_md5sig_pool();
1231         return 0;
1232 
1233 clear_hash:
1234         tcp_put_md5sig_pool();
1235 clear_hash_noput:
1236         memset(md5_hash, 0, 16);
1237         return 1;
1238 }
1239 
1240 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1241                         const struct sock *sk,
1242                         const struct sk_buff *skb)
1243 {
1244         struct tcp_md5sig_pool *hp;
1245         struct ahash_request *req;
1246         const struct tcphdr *th = tcp_hdr(skb);
1247         __be32 saddr, daddr;
1248 
1249         if (sk) { /* valid for establish/request sockets */
1250                 saddr = sk->sk_rcv_saddr;
1251                 daddr = sk->sk_daddr;
1252         } else {
1253                 const struct iphdr *iph = ip_hdr(skb);
1254                 saddr = iph->saddr;
1255                 daddr = iph->daddr;
1256         }
1257 
1258         hp = tcp_get_md5sig_pool();
1259         if (!hp)
1260                 goto clear_hash_noput;
1261         req = hp->md5_req;
1262 
1263         if (crypto_ahash_init(req))
1264                 goto clear_hash;
1265 
1266         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1267                 goto clear_hash;
1268         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1269                 goto clear_hash;
1270         if (tcp_md5_hash_key(hp, key))
1271                 goto clear_hash;
1272         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1273         if (crypto_ahash_final(req))
1274                 goto clear_hash;
1275 
1276         tcp_put_md5sig_pool();
1277         return 0;
1278 
1279 clear_hash:
1280         tcp_put_md5sig_pool();
1281 clear_hash_noput:
1282         memset(md5_hash, 0, 16);
1283         return 1;
1284 }
1285 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1286 
1287 #endif
1288 
1289 /* Called with rcu_read_lock() */
1290 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1291                                     const struct sk_buff *skb)
1292 {
1293 #ifdef CONFIG_TCP_MD5SIG
1294         /*
1295          * This gets called for each TCP segment that arrives
1296          * so we want to be efficient.
1297          * We have 3 drop cases:
1298          * o No MD5 hash and one expected.
1299          * o MD5 hash and we're not expecting one.
 1300          * o MD5 hash and it's wrong.
1301          */
1302         const __u8 *hash_location = NULL;
1303         struct tcp_md5sig_key *hash_expected;
1304         const struct iphdr *iph = ip_hdr(skb);
1305         const struct tcphdr *th = tcp_hdr(skb);
1306         int genhash;
1307         unsigned char newhash[16];
1308 
1309         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1310                                           AF_INET);
1311         hash_location = tcp_parse_md5sig_option(th);
1312 
1313         /* We've parsed the options - do we have a hash? */
1314         if (!hash_expected && !hash_location)
1315                 return false;
1316 
1317         if (hash_expected && !hash_location) {
1318                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1319                 return true;
1320         }
1321 
1322         if (!hash_expected && hash_location) {
1323                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1324                 return true;
1325         }
1326 
1327         /* Okay, so both hash_expected and hash_location are set -
1328          * we need to calculate the hash and compare it.
1329          */
1330         genhash = tcp_v4_md5_hash_skb(newhash,
1331                                       hash_expected,
1332                                       NULL, skb);
1333 
1334         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1335                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1336                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1337                                      &iph->saddr, ntohs(th->source),
1338                                      &iph->daddr, ntohs(th->dest),
1339                                      genhash ? " tcp_v4_calc_md5_hash failed"
1340                                      : "");
1341                 return true;
1342         }
1343         return false;
1344 #endif
1345         return false;
1346 }
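
/* Illustrative user-space sketch (not part of this file): the keys that
 * tcp_md5_do_lookup() searches above are installed with the TCP_MD5SIG
 * socket option.  A minimal example, assuming an IPv4 socket 'fd' and the
 * peer address in a struct sockaddr_in 'peer':
 *
 *	struct tcp_md5sig md5sig = { 0 };
 *
 *	memcpy(&md5sig.tcpm_addr, &peer, sizeof(peer));
 *	md5sig.tcpm_keylen = strlen("secret");
 *	memcpy(md5sig.tcpm_key, "secret", md5sig.tcpm_keylen);
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5sig, sizeof(md5sig)) < 0)
 *		perror("setsockopt(TCP_MD5SIG)");
 *
 * Both peers must install the same key; otherwise segments are dropped here
 * and the LINUX_MIB_TCPMD5* counters above are incremented.
 */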
1347 
1348 static void tcp_v4_init_req(struct request_sock *req,
1349                             const struct sock *sk_listener,
1350                             struct sk_buff *skb)
1351 {
1352         struct inet_request_sock *ireq = inet_rsk(req);
1353         struct net *net = sock_net(sk_listener);
1354 
1355         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1356         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1357         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1358 }
1359 
1360 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1361                                           struct flowi *fl,
1362                                           const struct request_sock *req)
1363 {
1364         return inet_csk_route_req(sk, &fl->u.ip4, req);
1365 }
1366 
1367 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1368         .family         =       PF_INET,
1369         .obj_size       =       sizeof(struct tcp_request_sock),
1370         .rtx_syn_ack    =       tcp_rtx_synack,
1371         .send_ack       =       tcp_v4_reqsk_send_ack,
1372         .destructor     =       tcp_v4_reqsk_destructor,
1373         .send_reset     =       tcp_v4_send_reset,
1374         .syn_ack_timeout =      tcp_syn_ack_timeout,
1375 };
1376 
1377 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1378         .mss_clamp      =       TCP_MSS_DEFAULT,
1379 #ifdef CONFIG_TCP_MD5SIG
1380         .req_md5_lookup =       tcp_v4_md5_lookup,
1381         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1382 #endif
1383         .init_req       =       tcp_v4_init_req,
1384 #ifdef CONFIG_SYN_COOKIES
1385         .cookie_init_seq =      cookie_v4_init_sequence,
1386 #endif
1387         .route_req      =       tcp_v4_route_req,
1388         .init_seq       =       tcp_v4_init_seq,
1389         .init_ts_off    =       tcp_v4_init_ts_off,
1390         .send_synack    =       tcp_v4_send_synack,
1391 };
1392 
1393 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1394 {
1395         /* Never answer SYNs sent to broadcast or multicast */
1396         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1397                 goto drop;
1398 
1399         return tcp_conn_request(&tcp_request_sock_ops,
1400                                 &tcp_request_sock_ipv4_ops, sk, skb);
1401 
1402 drop:
1403         tcp_listendrop(sk);
1404         return 0;
1405 }
1406 EXPORT_SYMBOL(tcp_v4_conn_request);
1407 
1408 
1409 /*
1410  * The three-way handshake has completed - we got a valid ACK for our
1411  * SYN-ACK - now create the new socket.
1412  */
1413 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1414                                   struct request_sock *req,
1415                                   struct dst_entry *dst,
1416                                   struct request_sock *req_unhash,
1417                                   bool *own_req)
1418 {
1419         struct inet_request_sock *ireq;
1420         struct inet_sock *newinet;
1421         struct tcp_sock *newtp;
1422         struct sock *newsk;
1423 #ifdef CONFIG_TCP_MD5SIG
1424         struct tcp_md5sig_key *key;
1425 #endif
1426         struct ip_options_rcu *inet_opt;
1427 
1428         if (sk_acceptq_is_full(sk))
1429                 goto exit_overflow;
1430 
1431         newsk = tcp_create_openreq_child(sk, req, skb);
1432         if (!newsk)
1433                 goto exit_nonewsk;
1434 
1435         newsk->sk_gso_type = SKB_GSO_TCPV4;
1436         inet_sk_rx_dst_set(newsk, skb);
1437 
1438         newtp                 = tcp_sk(newsk);
1439         newinet               = inet_sk(newsk);
1440         ireq                  = inet_rsk(req);
1441         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1442         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1443         newsk->sk_bound_dev_if = ireq->ir_iif;
1444         newinet->inet_saddr   = ireq->ir_loc_addr;
1445         inet_opt              = rcu_dereference(ireq->ireq_opt);
1446         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1447         newinet->mc_index     = inet_iif(skb);
1448         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1449         newinet->rcv_tos      = ip_hdr(skb)->tos;
1450         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1451         if (inet_opt)
1452                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1453         newinet->inet_id = prandom_u32();
1454 
1455         if (!dst) {
1456                 dst = inet_csk_route_child_sock(sk, newsk, req);
1457                 if (!dst)
1458                         goto put_and_exit;
1459         } else {
1460                 /* syncookie case: see end of cookie_v4_check() */
1461         }
1462         sk_setup_caps(newsk, dst);
1463 
1464         tcp_ca_openreq_child(newsk, dst);
1465 
1466         tcp_sync_mss(newsk, dst_mtu(dst));
1467         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1468 
1469         tcp_initialize_rcv_mss(newsk);
1470 
1471 #ifdef CONFIG_TCP_MD5SIG
1472         /* Copy over the MD5 key from the original socket */
1473         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1474                                 AF_INET);
1475         if (key) {
1476                 /*
1477                  * We're using one, so create a matching key
1478                  * on the newsk structure. If we fail to get
1479                  * memory, then we end up not copying the key
1480                  * across. Shucks.
1481                  */
1482                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1483                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1484                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1485         }
1486 #endif
1487 
1488         if (__inet_inherit_port(sk, newsk) < 0)
1489                 goto put_and_exit;
1490         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1491         if (likely(*own_req)) {
1492                 tcp_move_syn(newtp, req);
1493                 ireq->ireq_opt = NULL;
1494         } else {
1495                 newinet->inet_opt = NULL;
1496         }
1497         return newsk;
1498 
1499 exit_overflow:
1500         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1501 exit_nonewsk:
1502         dst_release(dst);
1503 exit:
1504         tcp_listendrop(sk);
1505         return NULL;
1506 put_and_exit:
1507         newinet->inet_opt = NULL;
1508         inet_csk_prepare_forced_close(newsk);
1509         tcp_done(newsk);
1510         goto exit;
1511 }
1512 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1513 
1514 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1515 {
1516 #ifdef CONFIG_SYN_COOKIES
1517         const struct tcphdr *th = tcp_hdr(skb);
1518 
1519         if (!th->syn)
1520                 sk = cookie_v4_check(sk, skb);
1521 #endif
1522         return sk;
1523 }
1524 
1525 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1526                          struct tcphdr *th, u32 *cookie)
1527 {
1528         u16 mss = 0;
1529 #ifdef CONFIG_SYN_COOKIES
1530         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1531                                     &tcp_request_sock_ipv4_ops, sk, th);
1532         if (mss) {
1533                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1534                 tcp_synq_overflow(sk);
1535         }
1536 #endif
1537         return mss;
1538 }
1539 
1540 /* The socket must have its spinlock held when we get
1541  * here, unless it is a TCP_LISTEN socket.
1542  *
1543  * We have a potential double-lock case here, so even when
1544  * doing backlog processing we use the BH locking scheme.
1545  * This is because we cannot sleep with the original spinlock
1546  * held.
1547  */
1548 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1549 {
1550         struct sock *rsk;
1551 
1552         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1553                 struct dst_entry *dst = sk->sk_rx_dst;
1554 
1555                 sock_rps_save_rxhash(sk, skb);
1556                 sk_mark_napi_id(sk, skb);
1557                 if (dst) {
1558                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1559                             !dst->ops->check(dst, 0)) {
1560                                 dst_release(dst);
1561                                 sk->sk_rx_dst = NULL;
1562                         }
1563                 }
1564                 tcp_rcv_established(sk, skb);
1565                 return 0;
1566         }
1567 
1568         if (tcp_checksum_complete(skb))
1569                 goto csum_err;
1570 
1571         if (sk->sk_state == TCP_LISTEN) {
1572                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1573 
1574                 if (!nsk)
1575                         goto discard;
1576                 if (nsk != sk) {
1577                         if (tcp_child_process(sk, nsk, skb)) {
1578                                 rsk = nsk;
1579                                 goto reset;
1580                         }
1581                         return 0;
1582                 }
1583         } else
1584                 sock_rps_save_rxhash(sk, skb);
1585 
1586         if (tcp_rcv_state_process(sk, skb)) {
1587                 rsk = sk;
1588                 goto reset;
1589         }
1590         return 0;
1591 
1592 reset:
1593         tcp_v4_send_reset(rsk, skb);
1594 discard:
1595         kfree_skb(skb);
1596         /* Be careful here. If this function gets more complicated and
1597          * gcc suffers from register pressure on the x86, sk (in %ebx)
1598          * might be destroyed here. This current version compiles correctly,
1599          * but you have been warned.
1600          */
1601         return 0;
1602 
1603 csum_err:
1604         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1605         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1606         goto discard;
1607 }
1608 EXPORT_SYMBOL(tcp_v4_do_rcv);
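
/* Sketch of the other half of the locking scheme described above (process
 * context, simplified): while a thread owns the socket lock, softirq input is
 * queued to the backlog by tcp_v4_rcv() and replayed through this function
 * (tcp_prot.backlog_rcv) when the lock is released:
 *
 *	lock_sock(sk);		// sock_owned_by_user(sk) is now true
 *	... tcp_sendmsg() / tcp_recvmsg() style work ...
 *	release_sock(sk);	// __release_sock() feeds each backlogged skb
 *				// back into tcp_v4_do_rcv()
 */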
1609 
1610 int tcp_v4_early_demux(struct sk_buff *skb)
1611 {
1612         const struct iphdr *iph;
1613         const struct tcphdr *th;
1614         struct sock *sk;
1615 
1616         if (skb->pkt_type != PACKET_HOST)
1617                 return 0;
1618 
1619         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1620                 return 0;
1621 
1622         iph = ip_hdr(skb);
1623         th = tcp_hdr(skb);
1624 
1625         if (th->doff < sizeof(struct tcphdr) / 4)
1626                 return 0;
1627 
1628         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1629                                        iph->saddr, th->source,
1630                                        iph->daddr, ntohs(th->dest),
1631                                        skb->skb_iif, inet_sdif(skb));
1632         if (sk) {
1633                 skb->sk = sk;
1634                 skb->destructor = sock_edemux;
1635                 if (sk_fullsock(sk)) {
1636                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1637 
1638                         if (dst)
1639                                 dst = dst_check(dst, 0);
1640                         if (dst &&
1641                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1642                                 skb_dst_set_noref(skb, dst);
1643                 }
1644         }
1645         return 0;
1646 }
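
/* Note (assumed to apply to kernels of this vintage): this early demux fast
 * path can be toggled at run time via the net.ipv4.tcp_early_demux and
 * net.ipv4.ip_early_demux sysctls, both enabled by default:
 *
 *	# sysctl net.ipv4.tcp_early_demux
 *	net.ipv4.tcp_early_demux = 1
 */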
1647 
1648 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1649 {
1650         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1651         struct skb_shared_info *shinfo;
1652         const struct tcphdr *th;
1653         struct tcphdr *thtail;
1654         struct sk_buff *tail;
1655         unsigned int hdrlen;
1656         bool fragstolen;
1657         u32 gso_segs;
1658         int delta;
1659 
1660         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1661          * we can fix skb->truesize to its real value to avoid future drops.
1662          * This is valid because skb is not yet charged to the socket.
1663          * It has been noticed that pure SACK packets were sometimes dropped
1664          * (when built by drivers without the copybreak feature).
1665          */
1666         skb_condense(skb);
1667 
1668         skb_dst_drop(skb);
1669 
1670         if (unlikely(tcp_checksum_complete(skb))) {
1671                 bh_unlock_sock(sk);
1672                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1673                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1674                 return true;
1675         }
1676 
1677         /* Attempt coalescing to the last skb in the backlog, even if we are
1678          * above the limits.
1679          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1680          */
1681         th = (const struct tcphdr *)skb->data;
1682         hdrlen = th->doff * 4;
1683         shinfo = skb_shinfo(skb);
1684 
1685         if (!shinfo->gso_size)
1686                 shinfo->gso_size = skb->len - hdrlen;
1687 
1688         if (!shinfo->gso_segs)
1689                 shinfo->gso_segs = 1;
1690 
1691         tail = sk->sk_backlog.tail;
1692         if (!tail)
1693                 goto no_coalesce;
1694         thtail = (struct tcphdr *)tail->data;
1695 
1696         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1697             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1698             ((TCP_SKB_CB(tail)->tcp_flags |
1699               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1700             !((TCP_SKB_CB(tail)->tcp_flags &
1701               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1702             ((TCP_SKB_CB(tail)->tcp_flags ^
1703               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1704 #ifdef CONFIG_TLS_DEVICE
1705             tail->decrypted != skb->decrypted ||
1706 #endif
1707             thtail->doff != th->doff ||
1708             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1709                 goto no_coalesce;
1710 
1711         __skb_pull(skb, hdrlen);
1712         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1713                 thtail->window = th->window;
1714 
1715                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1716 
1717                 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1718                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1719 
1720                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1721                  * thtail->fin, so that the fast path in tcp_rcv_established()
1722                  * is not entered if we append a packet with a FIN.
1723                  * SYN, RST, URG are not present.
1724                  * ACK is set on both packets.
1725                  * PSH : we do not really care in TCP stack,
1726                  *       at least for 'GRO' packets.
1727                  */
1728                 thtail->fin |= th->fin;
1729                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1730 
1731                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1732                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1733                         tail->tstamp = skb->tstamp;
1734                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1735                 }
1736 
1737                 /* Not as strict as GRO. We only need to carry the max mss value. */
1738                 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1739                                                  skb_shinfo(tail)->gso_size);
1740 
1741                 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1742                 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1743 
1744                 sk->sk_backlog.len += delta;
1745                 __NET_INC_STATS(sock_net(sk),
1746                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1747                 kfree_skb_partial(skb, fragstolen);
1748                 return false;
1749         }
1750         __skb_push(skb, hdrlen);
1751 
1752 no_coalesce:
1753         /* Only the socket owner can try to collapse/prune rx queues
1754          * to reduce memory overhead, so add a little headroom here.
1755          * Only a few socket backlogs are likely to be non-empty at any given time.
1756          */
1757         limit += 64*1024;
1758 
1759         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1760                 bh_unlock_sock(sk);
1761                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1762                 return true;
1763         }
1764         return false;
1765 }
1766 EXPORT_SYMBOL(tcp_add_backlog);
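
/* Illustrative note (user-space sketch, assumed values): the limit enforced
 * above is roughly sk_rcvbuf + sk_sndbuf + 64KB, so an application that
 * expects bursts while one of its threads holds the socket locked can raise
 * it by enlarging its receive buffer (still subject to the net.core.rmem_max
 * cap):
 *
 *	int rcvbuf = 4 * 1024 * 1024;	// example value
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
 *
 * Drops that still occur are counted as TCPBacklogDrop
 * (LINUX_MIB_TCPBACKLOGDROP) in /proc/net/netstat.
 */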
1767 
1768 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1769 {
1770         struct tcphdr *th = (struct tcphdr *)skb->data;
1771 
1772         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1773 }
1774 EXPORT_SYMBOL(tcp_filter);
1775 
1776 static void tcp_v4_restore_cb(struct sk_buff *skb)
1777 {
1778         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1779                 sizeof(struct inet_skb_parm));
1780 }
1781 
1782 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1783                            const struct tcphdr *th)
1784 {
1785         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1786          * barrier() makes sure the compiler won't play fool^Waliasing games.
1787          */
1788         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1789                 sizeof(struct inet_skb_parm));
1790         barrier();
1791 
1792         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1793         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1794                                     skb->len - th->doff * 4);
1795         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1796         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1797         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1798         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1799         TCP_SKB_CB(skb)->sacked  = 0;
1800         TCP_SKB_CB(skb)->has_rxtstamp =
1801                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1802 }
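
/* Worked example for the end_seq computation above: a segment with
 * seq = 1000 carrying 100 bytes of payload and the FIN flag set gets
 * end_seq = 1000 + 0 (syn) + 1 (fin) + 100 = 1101; SYN and FIN each consume
 * one unit of sequence space, while a pure ACK has end_seq == seq.
 */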
1803 
1804 /*
1805  *      From tcp_input.c
1806  */
1807 
1808 int tcp_v4_rcv(struct sk_buff *skb)
1809 {
1810         struct net *net = dev_net(skb->dev);
1811         struct sk_buff *skb_to_free;
1812         int sdif = inet_sdif(skb);
1813         const struct iphdr *iph;
1814         const struct tcphdr *th;
1815         bool refcounted;
1816         struct sock *sk;
1817         int ret;
1818 
1819         if (skb->pkt_type != PACKET_HOST)
1820                 goto discard_it;
1821 
1822         /* Count it even if it's bad */
1823         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1824 
1825         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1826                 goto discard_it;
1827 
1828         th = (const struct tcphdr *)skb->data;
1829 
1830         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1831                 goto bad_packet;
1832         if (!pskb_may_pull(skb, th->doff * 4))
1833                 goto discard_it;
1834 
1835         /* An explanation is required here, I think.
1836          * Packet length and doff are validated by header prediction,
1837          * provided the case of th->doff == 0 is eliminated.
1838          * So, we defer the checks. */
1839 
1840         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1841                 goto csum_error;
1842 
1843         th = (const struct tcphdr *)skb->data;
1844         iph = ip_hdr(skb);
1845 lookup:
1846         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1847                                th->dest, sdif, &refcounted);
1848         if (!sk)
1849                 goto no_tcp_socket;
1850 
1851 process:
1852         if (sk->sk_state == TCP_TIME_WAIT)
1853                 goto do_time_wait;
1854 
1855         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1856                 struct request_sock *req = inet_reqsk(sk);
1857                 bool req_stolen = false;
1858                 struct sock *nsk;
1859 
1860                 sk = req->rsk_listener;
1861                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1862                         sk_drops_add(sk, skb);
1863                         reqsk_put(req);
1864                         goto discard_it;
1865                 }
1866                 if (tcp_checksum_complete(skb)) {
1867                         reqsk_put(req);
1868                         goto csum_error;
1869                 }
1870                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1871                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1872                         goto lookup;
1873                 }
1874                 /* We own a reference on the listener, increase it again
1875                  * as we might lose it too soon.
1876                  */
1877                 sock_hold(sk);
1878                 refcounted = true;
1879                 nsk = NULL;
1880                 if (!tcp_filter(sk, skb)) {
1881                         th = (const struct tcphdr *)skb->data;
1882                         iph = ip_hdr(skb);
1883                         tcp_v4_fill_cb(skb, iph, th);
1884                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1885                 }
1886                 if (!nsk) {
1887                         reqsk_put(req);
1888                         if (req_stolen) {
1889                                 /* Another CPU got exclusive access to req
1890                                  * and created a full-blown socket.
1891                                  * Try to feed this packet to this socket
1892                                  * instead of discarding it.
1893                                  */
1894                                 tcp_v4_restore_cb(skb);
1895                                 sock_put(sk);
1896                                 goto lookup;
1897                         }
1898                         goto discard_and_relse;
1899                 }
1900                 if (nsk == sk) {
1901                         reqsk_put(req);
1902                         tcp_v4_restore_cb(skb);
1903                 } else if (tcp_child_process(sk, nsk, skb)) {
1904                         tcp_v4_send_reset(nsk, skb);
1905                         goto discard_and_relse;
1906                 } else {
1907                         sock_put(sk);
1908                         return 0;
1909                 }
1910         }
1911         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1912                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1913                 goto discard_and_relse;
1914         }
1915 
1916         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1917                 goto discard_and_relse;
1918 
1919         if (tcp_v4_inbound_md5_hash(sk, skb))
1920                 goto discard_and_relse;
1921 
1922         nf_reset_ct(skb);
1923 
1924         if (tcp_filter(sk, skb))
1925                 goto discard_and_relse;
1926         th = (const struct tcphdr *)skb->data;
1927         iph = ip_hdr(skb);
1928         tcp_v4_fill_cb(skb, iph, th);
1929 
1930         skb->dev = NULL;
1931 
1932         if (sk->sk_state == TCP_LISTEN) {
1933                 ret = tcp_v4_do_rcv(sk, skb);
1934                 goto put_and_return;
1935         }
1936 
1937         sk_incoming_cpu_update(sk);
1938 
1939         bh_lock_sock_nested(sk);
1940         tcp_segs_in(tcp_sk(sk), skb);
1941         ret = 0;
1942         if (!sock_owned_by_user(sk)) {
1943                 skb_to_free = sk->sk_rx_skb_cache;
1944                 sk->sk_rx_skb_cache = NULL;
1945                 ret = tcp_v4_do_rcv(sk, skb);
1946         } else {
1947                 if (tcp_add_backlog(sk, skb))
1948                         goto discard_and_relse;
1949                 skb_to_free = NULL;
1950         }
1951         bh_unlock_sock(sk);
1952         if (skb_to_free)
1953                 __kfree_skb(skb_to_free);
1954 
1955 put_and_return:
1956         if (refcounted)
1957                 sock_put(sk);
1958 
1959         return ret;
1960 
1961 no_tcp_socket:
1962         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1963                 goto discard_it;
1964 
1965         tcp_v4_fill_cb(skb, iph, th);
1966 
1967         if (tcp_checksum_complete(skb)) {
1968 csum_error:
1969                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1970 bad_packet:
1971                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1972         } else {
1973                 tcp_v4_send_reset(NULL, skb);
1974         }
1975 
1976 discard_it:
1977         /* Discard frame. */
1978         kfree_skb(skb);
1979         return 0;
1980 
1981 discard_and_relse:
1982         sk_drops_add(sk, skb);
1983         if (refcounted)
1984                 sock_put(sk);
1985         goto discard_it;
1986 
1987 do_time_wait:
1988         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1989                 inet_twsk_put(inet_twsk(sk));
1990                 goto discard_it;
1991         }
1992 
1993         tcp_v4_fill_cb(skb, iph, th);
1994 
1995         if (tcp_checksum_complete(skb)) {
1996                 inet_twsk_put(inet_twsk(sk));
1997                 goto csum_error;
1998         }
1999         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2000         case TCP_TW_SYN: {
2001                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2002                                                         &tcp_hashinfo, skb,
2003                                                         __tcp_hdrlen(th),
2004                                                         iph->saddr, th->source,
2005                                                         iph->daddr, th->dest,
2006                                                         inet_iif(skb),
2007                                                         sdif);
2008                 if (sk2) {
2009                         inet_twsk_deschedule_put(inet_twsk(sk));
2010                         sk = sk2;
2011                         tcp_v4_restore_cb(skb);
2012                         refcounted = false;
2013                         goto process;
2014                 }
2015         }
2016                 /* to ACK */
2017                 /* fall through */
2018         case TCP_TW_ACK:
2019                 tcp_v4_timewait_ack(sk, skb);
2020                 break;
2021         case TCP_TW_RST:
2022                 tcp_v4_send_reset(sk, skb);
2023                 inet_twsk_deschedule_put(inet_twsk(sk));
2024                 goto discard_it;
2025         case TCP_TW_SUCCESS:;
2026         }
2027         goto discard_it;
2028 }
2029 
2030 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2031         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2032         .twsk_unique    = tcp_twsk_unique,
2033         .twsk_destructor= tcp_twsk_destructor,
2034 };
2035 
2036 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2037 {
2038         struct dst_entry *dst = skb_dst(skb);
2039 
2040         if (dst && dst_hold_safe(dst)) {
2041                 sk->sk_rx_dst = dst;
2042                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2043         }
2044 }
2045 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2046 
2047 const struct inet_connection_sock_af_ops ipv4_specific = {
2048         .queue_xmit        = ip_queue_xmit,
2049         .send_check        = tcp_v4_send_check,
2050         .rebuild_header    = inet_sk_rebuild_header,
2051         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2052         .conn_request      = tcp_v4_conn_request,
2053         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2054         .net_header_len    = sizeof(struct iphdr),
2055         .setsockopt        = ip_setsockopt,
2056         .getsockopt        = ip_getsockopt,
2057         .addr2sockaddr     = inet_csk_addr2sockaddr,
2058         .sockaddr_len      = sizeof(struct sockaddr_in),
2059 #ifdef CONFIG_COMPAT
2060         .compat_setsockopt = compat_ip_setsockopt,
2061         .compat_getsockopt = compat_ip_getsockopt,
2062 #endif
2063         .mtu_reduced       = tcp_v4_mtu_reduced,
2064 };
2065 EXPORT_SYMBOL(ipv4_specific);
2066 
2067 #ifdef CONFIG_TCP_MD5SIG
2068 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2069         .md5_lookup             = tcp_v4_md5_lookup,
2070         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2071         .md5_parse              = tcp_v4_parse_md5_keys,
2072 };
2073 #endif
2074 
2075 /* NOTE: A lot of things are set to zero explicitly by the call to
2076  *       sk_alloc(), so they need not be done here.
2077  */
2078 static int tcp_v4_init_sock(struct sock *sk)
2079 {
2080         struct inet_connection_sock *icsk = inet_csk(sk);
2081 
2082         tcp_init_sock(sk);
2083 
2084         icsk->icsk_af_ops = &ipv4_specific;
2085 
2086 #ifdef CONFIG_TCP_MD5SIG
2087         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2088 #endif
2089 
2090         return 0;
2091 }
2092 
2093 void tcp_v4_destroy_sock(struct sock *sk)
2094 {
2095         struct tcp_sock *tp = tcp_sk(sk);
2096 
2097         trace_tcp_destroy_sock(sk);
2098 
2099         tcp_clear_xmit_timers(sk);
2100 
2101         tcp_cleanup_congestion_control(sk);
2102 
2103         tcp_cleanup_ulp(sk);
2104 
2105         /* Clean up the write buffer. */
2106         tcp_write_queue_purge(sk);
2107 
2108         /* Check if we want to disable active TFO */
2109         tcp_fastopen_active_disable_ofo_check(sk);
2110 
2111         /* Cleans up our, hopefully empty, out_of_order_queue. */
2112         skb_rbtree_purge(&tp->out_of_order_queue);
2113 
2114 #ifdef CONFIG_TCP_MD5SIG
2115         /* Clean up the MD5 key list, if any */
2116         if (tp->md5sig_info) {
2117                 tcp_clear_md5_list(sk);
2118                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2119                 tp->md5sig_info = NULL;
2120         }
2121 #endif
2122 
2123         /* Clean up a referenced TCP bind bucket. */
2124         if (inet_csk(sk)->icsk_bind_hash)
2125                 inet_put_port(sk);
2126 
2127         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2128 
2129         /* If socket is aborted during connect operation */
2130         tcp_free_fastopen_req(tp);
2131         tcp_fastopen_destroy_cipher(sk);
2132         tcp_saved_syn_free(tp);
2133 
2134         sk_sockets_allocated_dec(sk);
2135 }
2136 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2137 
2138 #ifdef CONFIG_PROC_FS
2139 /* Proc filesystem TCP sock list dumping. */
2140 
2141 /*
2142  * Get the next listener socket following cur.  If cur is NULL, get the first
2143  * socket starting from the bucket given in st->bucket; when st->bucket is
2144  * zero, the very first socket in the hash table is returned.
2145  */
2146 static void *listening_get_next(struct seq_file *seq, void *cur)
2147 {
2148         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2149         struct tcp_iter_state *st = seq->private;
2150         struct net *net = seq_file_net(seq);
2151         struct inet_listen_hashbucket *ilb;
2152         struct hlist_nulls_node *node;
2153         struct sock *sk = cur;
2154 
2155         if (!sk) {
2156 get_head:
2157                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2158                 spin_lock(&ilb->lock);
2159                 sk = sk_nulls_head(&ilb->nulls_head);
2160                 st->offset = 0;
2161                 goto get_sk;
2162         }
2163         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2164         ++st->num;
2165         ++st->offset;
2166 
2167         sk = sk_nulls_next(sk);
2168 get_sk:
2169         sk_nulls_for_each_from(sk, node) {
2170                 if (!net_eq(sock_net(sk), net))
2171                         continue;
2172                 if (sk->sk_family == afinfo->family)
2173                         return sk;
2174         }
2175         spin_unlock(&ilb->lock);
2176         st->offset = 0;
2177         if (++st->bucket < INET_LHTABLE_SIZE)
2178                 goto get_head;
2179         return NULL;
2180 }
2181 
2182 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2183 {
2184         struct tcp_iter_state *st = seq->private;
2185         void *rc;
2186 
2187         st->bucket = 0;
2188         st->offset = 0;
2189         rc = listening_get_next(seq, NULL);
2190 
2191         while (rc && *pos) {
2192                 rc = listening_get_next(seq, rc);
2193                 --*pos;
2194         }
2195         return rc;
2196 }
2197 
2198 static inline bool empty_bucket(const struct tcp_iter_state *st)
2199 {
2200         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2201 }
2202 
2203 /*
2204  * Get first established socket starting from bucket given in st->bucket.
2205  * If st->bucket is zero, the very first socket in the hash is returned.
2206  */
2207 static void *established_get_first(struct seq_file *seq)
2208 {
2209         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2210         struct tcp_iter_state *st = seq->private;
2211         struct net *net = seq_file_net(seq);
2212         void *rc = NULL;
2213 
2214         st->offset = 0;
2215         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2216                 struct sock *sk;
2217                 struct hlist_nulls_node *node;
2218                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2219 
2220                 /* Lockless fast path for the common case of empty buckets */
2221                 if (empty_bucket(st))
2222                         continue;
2223 
2224                 spin_lock_bh(lock);
2225                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2226                         if (sk->sk_family != afinfo->family ||
2227                             !net_eq(sock_net(sk), net)) {
2228                                 continue;
2229                         }
2230                         rc = sk;
2231                         goto out;
2232                 }
2233                 spin_unlock_bh(lock);
2234         }
2235 out:
2236         return rc;
2237 }
2238 
2239 static void *established_get_next(struct seq_file *seq, void *cur)
2240 {
2241         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2242         struct sock *sk = cur;
2243         struct hlist_nulls_node *node;
2244         struct tcp_iter_state *st = seq->private;
2245         struct net *net = seq_file_net(seq);
2246 
2247         ++st->num;
2248         ++st->offset;
2249 
2250         sk = sk_nulls_next(sk);
2251 
2252         sk_nulls_for_each_from(sk, node) {
2253                 if (sk->sk_family == afinfo->family &&
2254                     net_eq(sock_net(sk), net))
2255                         return sk;
2256         }
2257 
2258         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2259         ++st->bucket;
2260         return established_get_first(seq);
2261 }
2262 
2263 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2264 {
2265         struct tcp_iter_state *st = seq->private;
2266         void *rc;
2267 
2268         st->bucket = 0;
2269         rc = established_get_first(seq);
2270 
2271         while (rc && pos) {
2272                 rc = established_get_next(seq, rc);
2273                 --pos;
2274         }
2275         return rc;
2276 }
2277 
2278 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2279 {
2280         void *rc;
2281         struct tcp_iter_state *st = seq->private;
2282 
2283         st->state = TCP_SEQ_STATE_LISTENING;
2284         rc        = listening_get_idx(seq, &pos);
2285 
2286         if (!rc) {
2287                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2288                 rc        = established_get_idx(seq, pos);
2289         }
2290 
2291         return rc;
2292 }
2293 
2294 static void *tcp_seek_last_pos(struct seq_file *seq)
2295 {
2296         struct tcp_iter_state *st = seq->private;
2297         int offset = st->offset;
2298         int orig_num = st->num;
2299         void *rc = NULL;
2300 
2301         switch (st->state) {
2302         case TCP_SEQ_STATE_LISTENING:
2303                 if (st->bucket >= INET_LHTABLE_SIZE)
2304                         break;
2305                 st->state = TCP_SEQ_STATE_LISTENING;
2306                 rc = listening_get_next(seq, NULL);
2307                 while (offset-- && rc)
2308                         rc = listening_get_next(seq, rc);
2309                 if (rc)
2310                         break;
2311                 st->bucket = 0;
2312                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2313                 /* Fallthrough */
2314         case TCP_SEQ_STATE_ESTABLISHED:
2315                 if (st->bucket > tcp_hashinfo.ehash_mask)
2316                         break;
2317                 rc = established_get_first(seq);
2318                 while (offset-- && rc)
2319                         rc = established_get_next(seq, rc);
2320         }
2321 
2322         st->num = orig_num;
2323 
2324         return rc;
2325 }
2326 
2327 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2328 {
2329         struct tcp_iter_state *st = seq->private;
2330         void *rc;
2331 
2332         if (*pos && *pos == st->last_pos) {
2333                 rc = tcp_seek_last_pos(seq);
2334                 if (rc)
2335                         goto out;
2336         }
2337 
2338         st->state = TCP_SEQ_STATE_LISTENING;
2339         st->num = 0;
2340         st->bucket = 0;
2341         st->offset = 0;
2342         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2343 
2344 out:
2345         st->last_pos = *pos;
2346         return rc;
2347 }
2348 EXPORT_SYMBOL(tcp_seq_start);
2349 
2350 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2351 {
2352         struct tcp_iter_state *st = seq->private;
2353         void *rc = NULL;
2354 
2355         if (v == SEQ_START_TOKEN) {
2356                 rc = tcp_get_idx(seq, 0);
2357                 goto out;
2358         }
2359 
2360         switch (st->state) {
2361         case TCP_SEQ_STATE_LISTENING:
2362                 rc = listening_get_next(seq, v);
2363                 if (!rc) {
2364                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2365                         st->bucket = 0;
2366                         st->offset = 0;
2367                         rc        = established_get_first(seq);
2368                 }
2369                 break;
2370         case TCP_SEQ_STATE_ESTABLISHED:
2371                 rc = established_get_next(seq, v);
2372                 break;
2373         }
2374 out:
2375         ++*pos;
2376         st->last_pos = *pos;
2377         return rc;
2378 }
2379 EXPORT_SYMBOL(tcp_seq_next);
2380 
2381 void tcp_seq_stop(struct seq_file *seq, void *v)
2382 {
2383         struct tcp_iter_state *st = seq->private;
2384 
2385         switch (st->state) {
2386         case TCP_SEQ_STATE_LISTENING:
2387                 if (v != SEQ_START_TOKEN)
2388                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2389                 break;
2390         case TCP_SEQ_STATE_ESTABLISHED:
2391                 if (v)
2392                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2393                 break;
2394         }
2395 }
2396 EXPORT_SYMBOL(tcp_seq_stop);
2397 
2398 static void get_openreq4(const struct request_sock *req,
2399                          struct seq_file *f, int i)
2400 {
2401         const struct inet_request_sock *ireq = inet_rsk(req);
2402         long delta = req->rsk_timer.expires - jiffies;
2403 
2404         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2405                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2406                 i,
2407                 ireq->ir_loc_addr,
2408                 ireq->ir_num,
2409                 ireq->ir_rmt_addr,
2410                 ntohs(ireq->ir_rmt_port),
2411                 TCP_SYN_RECV,
2412                 0, 0, /* could print option size, but that is af dependent. */
2413                 1,    /* timers active (only the expire timer) */
2414                 jiffies_delta_to_clock_t(delta),
2415                 req->num_timeout,
2416                 from_kuid_munged(seq_user_ns(f),
2417                                  sock_i_uid(req->rsk_listener)),
2418                 0,  /* non standard timer */
2419                 0, /* open_requests have no inode */
2420                 0,
2421                 req);
2422 }
2423 
2424 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2425 {
2426         int timer_active;
2427         unsigned long timer_expires;
2428         const struct tcp_sock *tp = tcp_sk(sk);
2429         const struct inet_connection_sock *icsk = inet_csk(sk);
2430         const struct inet_sock *inet = inet_sk(sk);
2431         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2432         __be32 dest = inet->inet_daddr;
2433         __be32 src = inet->inet_rcv_saddr;
2434         __u16 destp = ntohs(inet->inet_dport);
2435         __u16 srcp = ntohs(inet->inet_sport);
2436         int rx_queue;
2437         int state;
2438 
2439         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2440             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2441             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2442                 timer_active    = 1;
2443                 timer_expires   = icsk->icsk_timeout;
2444         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2445                 timer_active    = 4;
2446                 timer_expires   = icsk->icsk_timeout;
2447         } else if (timer_pending(&sk->sk_timer)) {
2448                 timer_active    = 2;
2449                 timer_expires   = sk->sk_timer.expires;
2450         } else {
2451                 timer_active    = 0;
2452                 timer_expires = jiffies;
2453         }
2454 
2455         state = inet_sk_state_load(sk);
2456         if (state == TCP_LISTEN)
2457                 rx_queue = sk->sk_ack_backlog;
2458         else
2459                 /* Because we don't lock the socket,
2460                  * we might find a transient negative value.
2461                  */
2462                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2463                                       READ_ONCE(tp->copied_seq), 0);
2464 
2465         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2466                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2467                 i, src, srcp, dest, destp, state,
2468                 READ_ONCE(tp->write_seq) - tp->snd_una,
2469                 rx_queue,
2470                 timer_active,
2471                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2472                 icsk->icsk_retransmits,
2473                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2474                 icsk->icsk_probes_out,
2475                 sock_i_ino(sk),
2476                 refcount_read(&sk->sk_refcnt), sk,
2477                 jiffies_to_clock_t(icsk->icsk_rto),
2478                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2479                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2480                 tp->snd_cwnd,
2481                 state == TCP_LISTEN ?
2482                     fastopenq->max_qlen :
2483                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2484 }
2485 
2486 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2487                                struct seq_file *f, int i)
2488 {
2489         long delta = tw->tw_timer.expires - jiffies;
2490         __be32 dest, src;
2491         __u16 destp, srcp;
2492 
2493         dest  = tw->tw_daddr;
2494         src   = tw->tw_rcv_saddr;
2495         destp = ntohs(tw->tw_dport);
2496         srcp  = ntohs(tw->tw_sport);
2497 
2498         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2499                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2500                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2501                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2502                 refcount_read(&tw->tw_refcnt), tw);
2503 }
2504 
2505 #define TMPSZ 150
2506 
2507 static int tcp4_seq_show(struct seq_file *seq, void *v)
2508 {
2509         struct tcp_iter_state *st;
2510         struct sock *sk = v;
2511 
2512         seq_setwidth(seq, TMPSZ - 1);
2513         if (v == SEQ_START_TOKEN) {
2514                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2515                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2516                            "inode");
2517                 goto out;
2518         }
2519         st = seq->private;
2520 
2521         if (sk->sk_state == TCP_TIME_WAIT)
2522                 get_timewait4_sock(v, seq, st->num);
2523         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2524                 get_openreq4(v, seq, st->num);
2525         else
2526                 get_tcp4_sock(v, seq, st->num);
2527 out:
2528         seq_pad(seq, '\n');
2529         return 0;
2530 }
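
/* Illustrative sample of the resulting /proc/net/tcp output (values made up,
 * trailing columns elided): addresses are dumped as the raw __be32 value and
 * ports as hex after ntohs(), so on a little-endian host a socket listening
 * on 127.0.0.1:22 (state TCP_LISTEN == 0x0A) appears as:
 *
 *	  sl  local_address rem_address   st tx_queue rx_queue ...
 *	   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 ...
 *
 * get_tcp4_sock() above documents the remaining columns (timer state, uid,
 * inode, refcount, rto, cwnd, ...).
 */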
2531 
2532 static const struct seq_operations tcp4_seq_ops = {
2533         .show           = tcp4_seq_show,
2534         .start          = tcp_seq_start,
2535         .next           = tcp_seq_next,
2536         .stop           = tcp_seq_stop,
2537 };
2538 
2539 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2540         .family         = AF_INET,
2541 };
2542 
2543 static int __net_init tcp4_proc_init_net(struct net *net)
2544 {
2545         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2546                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2547                 return -ENOMEM;
2548         return 0;
2549 }
2550 
2551 static void __net_exit tcp4_proc_exit_net(struct net *net)
2552 {
2553         remove_proc_entry("tcp", net->proc_net);
2554 }
2555 
2556 static struct pernet_operations tcp4_net_ops = {
2557         .init = tcp4_proc_init_net,
2558         .exit = tcp4_proc_exit_net,
2559 };
2560 
2561 int __init tcp4_proc_init(void)
2562 {
2563         return register_pernet_subsys(&tcp4_net_ops);
2564 }
2565 
2566 void tcp4_proc_exit(void)
2567 {
2568         unregister_pernet_subsys(&tcp4_net_ops);
2569 }
2570 #endif /* CONFIG_PROC_FS */
2571 
2572 struct proto tcp_prot = {
2573         .name                   = "TCP",
2574         .owner                  = THIS_MODULE,
2575         .close                  = tcp_close,
2576         .pre_connect            = tcp_v4_pre_connect,
2577         .connect                = tcp_v4_connect,
2578         .disconnect             = tcp_disconnect,
2579         .accept                 = inet_csk_accept,
2580         .ioctl                  = tcp_ioctl,
2581         .init                   = tcp_v4_init_sock,
2582         .destroy                = tcp_v4_destroy_sock,
2583         .shutdown               = tcp_shutdown,
2584         .setsockopt             = tcp_setsockopt,
2585         .getsockopt             = tcp_getsockopt,
2586         .keepalive              = tcp_set_keepalive,
2587         .recvmsg                = tcp_recvmsg,
2588         .sendmsg                = tcp_sendmsg,
2589         .sendpage               = tcp_sendpage,
2590         .backlog_rcv            = tcp_v4_do_rcv,
2591         .release_cb             = tcp_release_cb,
2592         .hash                   = inet_hash,
2593         .unhash                 = inet_unhash,
2594         .get_port               = inet_csk_get_port,
2595         .enter_memory_pressure  = tcp_enter_memory_pressure,
2596         .leave_memory_pressure  = tcp_leave_memory_pressure,
2597         .stream_memory_free     = tcp_stream_memory_free,
2598         .sockets_allocated      = &tcp_sockets_allocated,
2599         .orphan_count           = &tcp_orphan_count,
2600         .memory_allocated       = &tcp_memory_allocated,
2601         .memory_pressure        = &tcp_memory_pressure,
2602         .sysctl_mem             = sysctl_tcp_mem,
2603         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2604         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2605         .max_header             = MAX_TCP_HEADER,
2606         .obj_size               = sizeof(struct tcp_sock),
2607         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2608         .twsk_prot              = &tcp_timewait_sock_ops,
2609         .rsk_prot               = &tcp_request_sock_ops,
2610         .h.hashinfo             = &tcp_hashinfo,
2611         .no_autobind            = true,
2612 #ifdef CONFIG_COMPAT
2613         .compat_setsockopt      = compat_tcp_setsockopt,
2614         .compat_getsockopt      = compat_tcp_getsockopt,
2615 #endif
2616         .diag_destroy           = tcp_abort,
2617 };
2618 EXPORT_SYMBOL(tcp_prot);
2619 
2620 static void __net_exit tcp_sk_exit(struct net *net)
2621 {
2622         int cpu;
2623 
2624         if (net->ipv4.tcp_congestion_control)
2625                 module_put(net->ipv4.tcp_congestion_control->owner);
2626 
2627         for_each_possible_cpu(cpu)
2628                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2629         free_percpu(net->ipv4.tcp_sk);
2630 }
2631 
2632 static int __net_init tcp_sk_init(struct net *net)
2633 {
2634         int res, cpu, cnt;
2635 
2636         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2637         if (!net->ipv4.tcp_sk)
2638                 return -ENOMEM;
2639 
2640         for_each_possible_cpu(cpu) {
2641                 struct sock *sk;
2642 
2643                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2644                                            IPPROTO_TCP, net);
2645                 if (res)
2646                         goto fail;
2647                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2648 
2649                 /* Please enforce IP_DF and IPID==0 for RST and
2650                  * ACK sent in SYN-RECV and TIME-WAIT state.
2651                  */
2652                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2653 
2654                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2655         }
2656 
2657         net->ipv4.sysctl_tcp_ecn = 2;
2658         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2659 
2660         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2661         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2662         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2663         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2664         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2665 
2666         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2667         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2668         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2669 
2670         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2671         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2672         net->ipv4.sysctl_tcp_syncookies = 1;
2673         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2674         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2675         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2676         net->ipv4.sysctl_tcp_orphan_retries = 0;
2677         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2678         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2679         net->ipv4.sysctl_tcp_tw_reuse = 2;
2680 
2681         cnt = tcp_hashinfo.ehash_mask + 1;
2682         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2683         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2684 
2685         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2686         net->ipv4.sysctl_tcp_sack = 1;
2687         net->ipv4.sysctl_tcp_window_scaling = 1;
2688         net->ipv4.sysctl_tcp_timestamps = 1;
2689         net->ipv4.sysctl_tcp_early_retrans = 3;
2690         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2691         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2692         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2693         net->ipv4.sysctl_tcp_max_reordering = 300;
2694         net->ipv4.sysctl_tcp_dsack = 1;
2695         net->ipv4.sysctl_tcp_app_win = 31;
2696         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2697         net->ipv4.sysctl_tcp_frto = 2;
2698         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2699         /* This limits the percentage of the congestion window which we
2700          * will allow a single TSO frame to consume.  Building TSO frames
2701          * which are too large can cause TCP streams to be bursty.
2702          */
2703         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
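        /* E.g. with the default divisor of 3 and a congestion window of ~90
         * packets, a single TSO burst is limited to roughly 30 packets.
         */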
2704         /* Default TSQ limit of 16 TSO segments */
2705         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2706         /* RFC 5961 challenge ACK rate limiting */
2707         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2708         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2709         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2710         net->ipv4.sysctl_tcp_autocorking = 1;
2711         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2712         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2713         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2714         if (net != &init_net) {
2715                 memcpy(net->ipv4.sysctl_tcp_rmem,
2716                        init_net.ipv4.sysctl_tcp_rmem,
2717                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2718                 memcpy(net->ipv4.sysctl_tcp_wmem,
2719                        init_net.ipv4.sysctl_tcp_wmem,
2720                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2721         }
2722         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2723         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2724         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2725         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2726         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2727         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2728 
2729         /* Reno is always built in */
2730         if (!net_eq(net, &init_net) &&
2731             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2732                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2733         else
2734                 net->ipv4.tcp_congestion_control = &tcp_reno;
2735 
2736         return 0;
2737 fail:
2738         tcp_sk_exit(net);
2739 
2740         return res;
2741 }
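
/* Illustrative note: the per-netns defaults set above are what a newly
 * created network namespace sees under /proc/sys/net/ipv4/, e.g.
 * (field -> sysctl name, default taken from this function):
 *
 *	sysctl_tcp_syncookies     -> net.ipv4.tcp_syncookies     = 1
 *	sysctl_tcp_sack           -> net.ipv4.tcp_sack           = 1
 *	sysctl_tcp_window_scaling -> net.ipv4.tcp_window_scaling = 1
 *
 * They can be inspected or overridden per namespace, e.g.
 * `sysctl net.ipv4.tcp_syncookies`; the table of exported names lives in
 * net/ipv4/sysctl_net_ipv4.c.
 */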
2742 
2743 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2744 {
2745         struct net *net;
2746 
2747         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2748 
2749         list_for_each_entry(net, net_exit_list, exit_list)
2750                 tcp_fastopen_ctx_destroy(net);
2751 }
2752 
2753 static struct pernet_operations __net_initdata tcp_sk_ops = {
2754        .init       = tcp_sk_init,
2755        .exit       = tcp_sk_exit,
2756        .exit_batch = tcp_sk_exit_batch,
2757 };
2758 
2759 void __init tcp_v4_init(void)
2760 {
2761         if (register_pernet_subsys(&tcp_sk_ops))
2762                 panic("Failed to create the TCP control socket.\n");
2763 }
