1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78 
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84 
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87 
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91 
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96 
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
99 
static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103 					  ip_hdr(skb)->saddr,
104 					  tcp_hdr(skb)->dest,
105 					  tcp_hdr(skb)->source);
106 }
107 
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 
113 	/* With PAWS, it is safe from the viewpoint
114 	   of data integrity. Even without PAWS it is safe provided sequence
115 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116 
	   Actually, the idea is close to VJ's, except that the timestamp
	   cache is held not per host but per port pair, and the TW bucket
	   is used as the state holder.
120 
121 	   If TW bucket has been already destroyed we fall back to VJ's scheme
122 	   and use initial timestamp retrieved from peer table.
123 	 */
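	/* Illustrative sketch of the sequence bump below (numbers are made up):
	 * if the dying TIME-WAIT connection last sent tw_snd_nxt = 1000000,
	 * the reincarnated connection starts its write_seq at
	 * 1000000 + 65535 + 2 = 1065537, safely above anything the old
	 * incarnation could still have in flight within a 64KB window.
	 */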
124 	if (tcptw->tw_ts_recent_stamp &&
125 	    (!twp || (sysctl_tcp_tw_reuse &&
126 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128 		if (tp->write_seq == 0)
129 			tp->write_seq = 1;
130 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
131 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132 		sock_hold(sktw);
133 		return 1;
134 	}
135 
136 	return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139 
140 /* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142 {
143 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144 	struct inet_sock *inet = inet_sk(sk);
145 	struct tcp_sock *tp = tcp_sk(sk);
146 	__be16 orig_sport, orig_dport;
147 	__be32 daddr, nexthop;
148 	struct flowi4 *fl4;
149 	struct rtable *rt;
150 	int err;
151 	struct ip_options_rcu *inet_opt;
152 
153 	if (addr_len < sizeof(struct sockaddr_in))
154 		return -EINVAL;
155 
156 	if (usin->sin_family != AF_INET)
157 		return -EAFNOSUPPORT;
158 
159 	nexthop = daddr = usin->sin_addr.s_addr;
160 	inet_opt = rcu_dereference_protected(inet->inet_opt,
161 					     sock_owned_by_user(sk));
162 	if (inet_opt && inet_opt->opt.srr) {
163 		if (!daddr)
164 			return -EINVAL;
165 		nexthop = inet_opt->opt.faddr;
166 	}
167 
168 	orig_sport = inet->inet_sport;
169 	orig_dport = usin->sin_port;
170 	fl4 = &inet->cork.fl.u.ip4;
171 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173 			      IPPROTO_TCP,
174 			      orig_sport, orig_dport, sk);
175 	if (IS_ERR(rt)) {
176 		err = PTR_ERR(rt);
177 		if (err == -ENETUNREACH)
178 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179 		return err;
180 	}
181 
182 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183 		ip_rt_put(rt);
184 		return -ENETUNREACH;
185 	}
186 
187 	if (!inet_opt || !inet_opt->opt.srr)
188 		daddr = fl4->daddr;
189 
190 	if (!inet->inet_saddr)
191 		inet->inet_saddr = fl4->saddr;
192 	sk_rcv_saddr_set(sk, inet->inet_saddr);
193 
194 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195 		/* Reset inherited state */
196 		tp->rx_opt.ts_recent	   = 0;
197 		tp->rx_opt.ts_recent_stamp = 0;
198 		if (likely(!tp->repair))
199 			tp->write_seq	   = 0;
200 	}
201 
202 	if (tcp_death_row.sysctl_tw_recycle &&
203 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204 		tcp_fetch_timewait_stamp(sk, &rt->dst);
205 
206 	inet->inet_dport = usin->sin_port;
207 	sk_daddr_set(sk, daddr);
208 
209 	inet_csk(sk)->icsk_ext_hdr_len = 0;
210 	if (inet_opt)
211 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212 
213 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214 
	/* Socket identity is still unknown (sport may be zero).
	 * However, we set the state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the hash
	 * tables and complete initialization after this.
	 */
220 	tcp_set_state(sk, TCP_SYN_SENT);
221 	err = inet_hash_connect(&tcp_death_row, sk);
222 	if (err)
223 		goto failure;
224 
225 	sk_set_txhash(sk);
226 
227 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 			       inet->inet_sport, inet->inet_dport, sk);
229 	if (IS_ERR(rt)) {
230 		err = PTR_ERR(rt);
231 		rt = NULL;
232 		goto failure;
233 	}
234 	/* OK, now commit destination to socket.  */
235 	sk->sk_gso_type = SKB_GSO_TCPV4;
236 	sk_setup_caps(sk, &rt->dst);
237 
238 	if (!tp->write_seq && likely(!tp->repair))
239 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240 							   inet->inet_daddr,
241 							   inet->inet_sport,
242 							   usin->sin_port);
243 
244 	inet->inet_id = tp->write_seq ^ jiffies;
245 
246 	err = tcp_connect(sk);
247 
248 	rt = NULL;
249 	if (err)
250 		goto failure;
251 
252 	return 0;
253 
254 failure:
255 	/*
256 	 * This unhashes the socket and releases the local port,
257 	 * if necessary.
258 	 */
259 	tcp_set_state(sk, TCP_CLOSE);
260 	ip_rt_put(rt);
261 	sk->sk_route_caps = 0;
262 	inet->inet_dport = 0;
263 	return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
266 
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274 	struct dst_entry *dst;
275 	struct inet_sock *inet = inet_sk(sk);
276 	u32 mtu = tcp_sk(sk)->mtu_info;
277 
278 	dst = inet_csk_update_pmtu(sk, mtu);
279 	if (!dst)
280 		return;
281 
	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
285 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 		sk->sk_err_soft = EMSGSIZE;
287 
288 	mtu = dst_mtu(dst);
289 
290 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 	    ip_sk_accept_pmtu(sk) &&
292 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293 		tcp_sync_mss(sk, mtu);
294 
295 		/* Resend the TCP packet because it's
296 		 * clear that the old packet has been
297 		 * dropped. This is the new "fast" path mtu
298 		 * discovery.
299 		 */
300 		tcp_simple_retransmit(sk);
301 	} /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
304 
static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307 	struct dst_entry *dst = __sk_dst_check(sk, 0);
308 
309 	if (dst)
310 		dst->ops->redirect(dst, sk, skb);
311 }
312 
313 
314 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
316 {
317 	struct request_sock *req = inet_reqsk(sk);
318 	struct net *net = sock_net(sk);
319 
320 	/* ICMPs are not backlogged, hence we cannot get
321 	 * an established socket here.
322 	 */
323 	if (seq != tcp_rsk(req)->snt_isn) {
324 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
325 	} else if (abort) {
326 		/*
327 		 * Still in SYN_RECV, just remove it silently.
328 		 * There is no good way to pass the error to the newly
329 		 * created socket, and POSIX does not want network
330 		 * errors returned from accept().
331 		 */
332 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
333 		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
334 	}
335 	reqsk_put(req);
336 }
337 EXPORT_SYMBOL(tcp_req_err);
338 
339 /*
340  * This routine is called by the ICMP module when it gets some
341  * sort of error condition.  If err < 0 then the socket should
342  * be closed and the error returned to the user.  If err > 0
343  * it's just the icmp type << 8 | icmp code.  After adjustment
344  * header points to the first 8 bytes of the tcp header.  We need
345  * to find the appropriate port.
346  *
347  * The locking strategy used here is very "optimistic". When
348  * someone else accesses the socket the ICMP is just dropped
349  * and for some paths there is no check at all.
350  * A more general error queue to queue errors for later handling
351  * is probably better.
352  *
353  */
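/* For illustration of the mapping described above: an ICMP "destination
 * unreachable / port unreachable" (type ICMP_DEST_UNREACH, code
 * ICMP_PORT_UNREACH) becomes err = icmp_err_convert[code].errno, i.e.
 * ECONNREFUSED, which is reported as a hard error for an un-owned socket in
 * SYN_SENT/SYN_RECV and stored in sk_err_soft otherwise.
 */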
354 
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
356 {
357 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
358 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
359 	struct inet_connection_sock *icsk;
360 	struct tcp_sock *tp;
361 	struct inet_sock *inet;
362 	const int type = icmp_hdr(icmp_skb)->type;
363 	const int code = icmp_hdr(icmp_skb)->code;
364 	struct sock *sk;
365 	struct sk_buff *skb;
366 	struct request_sock *fastopen;
367 	__u32 seq, snd_una;
368 	__u32 remaining;
369 	int err;
370 	struct net *net = dev_net(icmp_skb->dev);
371 
372 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
373 				       th->dest, iph->saddr, ntohs(th->source),
374 				       inet_iif(icmp_skb));
375 	if (!sk) {
376 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
377 		return;
378 	}
379 	if (sk->sk_state == TCP_TIME_WAIT) {
380 		inet_twsk_put(inet_twsk(sk));
381 		return;
382 	}
383 	seq = ntohl(th->seq);
384 	if (sk->sk_state == TCP_NEW_SYN_RECV)
385 		return tcp_req_err(sk, seq,
386 				  type == ICMP_PARAMETERPROB ||
387 				  type == ICMP_TIME_EXCEEDED ||
388 				  (type == ICMP_DEST_UNREACH &&
389 				   (code == ICMP_NET_UNREACH ||
390 				    code == ICMP_HOST_UNREACH)));
391 
392 	bh_lock_sock(sk);
393 	/* If too many ICMPs get dropped on busy
394 	 * servers this needs to be solved differently.
395 	 * We do take care of PMTU discovery (RFC1191) special case :
396 	 * we can receive locally generated ICMP messages while socket is held.
397 	 */
398 	if (sock_owned_by_user(sk)) {
399 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
400 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
401 	}
402 	if (sk->sk_state == TCP_CLOSE)
403 		goto out;
404 
405 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
406 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
407 		goto out;
408 	}
409 
410 	icsk = inet_csk(sk);
411 	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (see tcp_create_openreq_child()) */
413 	fastopen = tp->fastopen_rsk;
414 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
415 	if (sk->sk_state != TCP_LISTEN &&
416 	    !between(seq, snd_una, tp->snd_nxt)) {
417 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
418 		goto out;
419 	}
420 
421 	switch (type) {
422 	case ICMP_REDIRECT:
423 		do_redirect(icmp_skb, sk);
424 		goto out;
425 	case ICMP_SOURCE_QUENCH:
426 		/* Just silently ignore these. */
427 		goto out;
428 	case ICMP_PARAMETERPROB:
429 		err = EPROTO;
430 		break;
431 	case ICMP_DEST_UNREACH:
432 		if (code > NR_ICMP_UNREACH)
433 			goto out;
434 
435 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			 */
440 			if (sk->sk_state == TCP_LISTEN)
441 				goto out;
442 
443 			tp->mtu_info = info;
444 			if (!sock_owned_by_user(sk)) {
445 				tcp_v4_mtu_reduced(sk);
446 			} else {
447 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
448 					sock_hold(sk);
449 			}
450 			goto out;
451 		}
452 
453 		err = icmp_err_convert[code].errno;
454 		/* check if icmp_skb allows revert of backoff
455 		 * (see draft-zimmermann-tcp-lcd) */
456 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
457 			break;
458 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
459 		    !icsk->icsk_backoff || fastopen)
460 			break;
461 
462 		if (sock_owned_by_user(sk))
463 			break;
464 
465 		icsk->icsk_backoff--;
466 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
467 					       TCP_TIMEOUT_INIT;
468 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
469 
470 		skb = tcp_write_queue_head(sk);
471 		BUG_ON(!skb);
472 
473 		remaining = icsk->icsk_rto -
474 			    min(icsk->icsk_rto,
475 				tcp_time_stamp - tcp_skb_timestamp(skb));
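		/* In other words: remaining = RTO minus the time that has
		 * already elapsed since the head-of-queue segment was sent;
		 * if that un-backed-off RTO has already expired, retransmit
		 * immediately below instead of re-arming the timer.
		 */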
476 
477 		if (remaining) {
478 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
479 						  remaining, TCP_RTO_MAX);
480 		} else {
481 			/* RTO revert clocked out retransmission.
482 			 * Will retransmit now */
483 			tcp_retransmit_timer(sk);
484 		}
485 
486 		break;
487 	case ICMP_TIME_EXCEEDED:
488 		err = EHOSTUNREACH;
489 		break;
490 	default:
491 		goto out;
492 	}
493 
494 	switch (sk->sk_state) {
495 	case TCP_SYN_SENT:
496 	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
500 		if (fastopen && !fastopen->sk)
501 			break;
502 
503 		if (!sock_owned_by_user(sk)) {
504 			sk->sk_err = err;
505 
506 			sk->sk_error_report(sk);
507 
508 			tcp_done(sk);
509 		} else {
510 			sk->sk_err_soft = err;
511 		}
512 		goto out;
513 	}
514 
515 	/* If we've already connected we will keep trying
516 	 * until we time out, or the user gives up.
517 	 *
	 * RFC 1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
	 * obsoleted by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable and
	 * broken firewalls sit in every dark corner sending random errors
	 * ordered by their masters, even these two messages have finally lost
	 * their original sense (even Linux sends invalid PORT_UNREACHs).
526 	 *
527 	 * Now we are in compliance with RFCs.
528 	 *							--ANK (980905)
529 	 */
530 
531 	inet = inet_sk(sk);
532 	if (!sock_owned_by_user(sk) && inet->recverr) {
533 		sk->sk_err = err;
534 		sk->sk_error_report(sk);
535 	} else	{ /* Only an error on timeout */
536 		sk->sk_err_soft = err;
537 	}
538 
539 out:
540 	bh_unlock_sock(sk);
541 	sock_put(sk);
542 }
543 
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
545 {
546 	struct tcphdr *th = tcp_hdr(skb);
547 
548 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
549 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
550 		skb->csum_start = skb_transport_header(skb) - skb->head;
551 		skb->csum_offset = offsetof(struct tcphdr, check);
552 	} else {
553 		th->check = tcp_v4_check(skb->len, saddr, daddr,
554 					 csum_partial(th,
555 						      th->doff << 2,
556 						      skb->csum));
557 	}
558 }
559 
560 /* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
562 {
563 	const struct inet_sock *inet = inet_sk(sk);
564 
565 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
566 }
567 EXPORT_SYMBOL(tcp_v4_send_check);
568 
569 /*
570  *	This routine will send an RST to the other tcp.
571  *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's TCP.
 *		So we build the reply based only on the parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
580  */
581 
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
583 {
584 	const struct tcphdr *th = tcp_hdr(skb);
585 	struct {
586 		struct tcphdr th;
587 #ifdef CONFIG_TCP_MD5SIG
588 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
589 #endif
590 	} rep;
591 	struct ip_reply_arg arg;
592 #ifdef CONFIG_TCP_MD5SIG
593 	struct tcp_md5sig_key *key;
594 	const __u8 *hash_location = NULL;
595 	unsigned char newhash[16];
596 	int genhash;
597 	struct sock *sk1 = NULL;
598 #endif
599 	struct net *net;
600 
601 	/* Never send a reset in response to a reset. */
602 	if (th->rst)
603 		return;
604 
	/* If sk is not NULL, it means we did a successful lookup and the
	 * incoming route had to be correct. prequeue might have dropped our dst.
	 */
608 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
609 		return;
610 
611 	/* Swap the send and the receive. */
612 	memset(&rep, 0, sizeof(rep));
613 	rep.th.dest   = th->source;
614 	rep.th.source = th->dest;
615 	rep.th.doff   = sizeof(struct tcphdr) / 4;
616 	rep.th.rst    = 1;
617 
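	/* RFC 793 reset generation, for reference: if the offending segment
	 * carried an ACK, the RST takes its sequence number from that ACK and
	 * needs no ACK of its own; otherwise the RST has sequence number zero
	 * and ACKs everything the segment occupied (SYN and FIN each count
	 * for one).
	 */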
618 	if (th->ack) {
619 		rep.th.seq = th->ack_seq;
620 	} else {
621 		rep.th.ack = 1;
622 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
623 				       skb->len - (th->doff << 2));
624 	}
625 
626 	memset(&arg, 0, sizeof(arg));
627 	arg.iov[0].iov_base = (unsigned char *)&rep;
628 	arg.iov[0].iov_len  = sizeof(rep.th);
629 
630 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
631 #ifdef CONFIG_TCP_MD5SIG
632 	hash_location = tcp_parse_md5sig_option(th);
633 	if (!sk && hash_location) {
		/*
		 * The active side is gone. Try to find the listening socket via
		 * the source port, and then find the md5 key through that
		 * listening socket. We do not lose any security here: the
		 * incoming packet is checked against the md5 hash of the key we
		 * find, and no RST is generated if the hash doesn't match.
		 */
641 		sk1 = __inet_lookup_listener(net,
642 					     &tcp_hashinfo, ip_hdr(skb)->saddr,
643 					     th->source, ip_hdr(skb)->daddr,
644 					     ntohs(th->source), inet_iif(skb));
		/* don't send an RST if we can't find the key */
646 		if (!sk1)
647 			return;
648 		rcu_read_lock();
649 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
650 					&ip_hdr(skb)->saddr, AF_INET);
651 		if (!key)
652 			goto release_sk1;
653 
654 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
655 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
656 			goto release_sk1;
657 	} else {
658 		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
659 					     &ip_hdr(skb)->saddr,
660 					     AF_INET) : NULL;
661 	}
662 
663 	if (key) {
664 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
665 				   (TCPOPT_NOP << 16) |
666 				   (TCPOPT_MD5SIG << 8) |
667 				   TCPOLEN_MD5SIG);
668 		/* Update length and the length the header thinks exists */
669 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
670 		rep.th.doff = arg.iov[0].iov_len / 4;
671 
672 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
673 				     key, ip_hdr(skb)->saddr,
674 				     ip_hdr(skb)->daddr, &rep.th);
675 	}
676 #endif
677 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
678 				      ip_hdr(skb)->saddr, /* XXX */
679 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
680 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
681 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost and
	 * routing might fail in this case. No choice here: if we chose to force
	 * the input interface, we would misroute in the case of an asymmetric route.
	 */
686 	if (sk)
687 		arg.bound_dev_if = sk->sk_bound_dev_if;
688 
689 	arg.tos = ip_hdr(skb)->tos;
690 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
691 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
692 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
693 			      &arg, arg.iov[0].iov_len);
694 
695 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
696 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
697 
698 #ifdef CONFIG_TCP_MD5SIG
699 release_sk1:
700 	if (sk1) {
701 		rcu_read_unlock();
702 		sock_put(sk1);
703 	}
704 #endif
705 }
706 
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly. What can I do?
 */
710 
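/* Layout note for rep.opt[] below (a summary of the code, not authoritative):
 * the timestamp option is emitted as NOP, NOP, TIMESTAMP(kind 8, len 10),
 * tsval, tsecr - TCPOLEN_TSTAMP_ALIGNED (12) bytes in total - and the
 * optional MD5 option uses the same NOP-NOP padding scheme.
 */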
static void tcp_v4_send_ack(struct net *net,
712 			    struct sk_buff *skb, u32 seq, u32 ack,
713 			    u32 win, u32 tsval, u32 tsecr, int oif,
714 			    struct tcp_md5sig_key *key,
715 			    int reply_flags, u8 tos)
716 {
717 	const struct tcphdr *th = tcp_hdr(skb);
718 	struct {
719 		struct tcphdr th;
720 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
721 #ifdef CONFIG_TCP_MD5SIG
722 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
723 #endif
724 			];
725 	} rep;
726 	struct ip_reply_arg arg;
727 
728 	memset(&rep.th, 0, sizeof(struct tcphdr));
729 	memset(&arg, 0, sizeof(arg));
730 
731 	arg.iov[0].iov_base = (unsigned char *)&rep;
732 	arg.iov[0].iov_len  = sizeof(rep.th);
733 	if (tsecr) {
734 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
735 				   (TCPOPT_TIMESTAMP << 8) |
736 				   TCPOLEN_TIMESTAMP);
737 		rep.opt[1] = htonl(tsval);
738 		rep.opt[2] = htonl(tsecr);
739 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
740 	}
741 
742 	/* Swap the send and the receive. */
743 	rep.th.dest    = th->source;
744 	rep.th.source  = th->dest;
745 	rep.th.doff    = arg.iov[0].iov_len / 4;
746 	rep.th.seq     = htonl(seq);
747 	rep.th.ack_seq = htonl(ack);
748 	rep.th.ack     = 1;
749 	rep.th.window  = htons(win);
750 
751 #ifdef CONFIG_TCP_MD5SIG
752 	if (key) {
753 		int offset = (tsecr) ? 3 : 0;
754 
755 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
756 					  (TCPOPT_NOP << 16) |
757 					  (TCPOPT_MD5SIG << 8) |
758 					  TCPOLEN_MD5SIG);
759 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
760 		rep.th.doff = arg.iov[0].iov_len/4;
761 
762 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
763 				    key, ip_hdr(skb)->saddr,
764 				    ip_hdr(skb)->daddr, &rep.th);
765 	}
766 #endif
767 	arg.flags = reply_flags;
768 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
769 				      ip_hdr(skb)->saddr, /* XXX */
770 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
771 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
772 	if (oif)
773 		arg.bound_dev_if = oif;
774 	arg.tos = tos;
775 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
776 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
777 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
778 			      &arg, arg.iov[0].iov_len);
779 
780 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
781 }
782 
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
784 {
785 	struct inet_timewait_sock *tw = inet_twsk(sk);
786 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
787 
788 	tcp_v4_send_ack(sock_net(sk), skb,
789 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
790 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
791 			tcp_time_stamp + tcptw->tw_ts_offset,
792 			tcptw->tw_ts_recent,
793 			tw->tw_bound_dev_if,
794 			tcp_twsk_md5_key(tcptw),
795 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
796 			tw->tw_tos
797 			);
798 
799 	inet_twsk_put(tw);
800 }
801 
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
803 				  struct request_sock *req)
804 {
805 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
806 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
807 	 */
808 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
809 					     tcp_sk(sk)->snd_nxt;
810 
811 	tcp_v4_send_ack(sock_net(sk), skb, seq,
812 			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
813 			tcp_time_stamp,
814 			req->ts_recent,
815 			0,
816 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
817 					  AF_INET),
818 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
819 			ip_hdr(skb)->tos);
820 }
821 
822 /*
823  *	Send a SYN-ACK after having received a SYN.
824  *	This still operates on a request_sock only, not on a big
825  *	socket.
826  */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
828 			      struct flowi *fl,
829 			      struct request_sock *req,
830 			      struct tcp_fastopen_cookie *foc,
831 				  bool attach_req)
832 {
833 	const struct inet_request_sock *ireq = inet_rsk(req);
834 	struct flowi4 fl4;
835 	int err = -1;
836 	struct sk_buff *skb;
837 
838 	/* First, grab a route. */
839 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
840 		return -1;
841 
842 	skb = tcp_make_synack(sk, dst, req, foc, attach_req);
843 
844 	if (skb) {
845 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
846 
847 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
848 					    ireq->ir_rmt_addr,
849 					    ireq->opt);
850 		err = net_xmit_eval(err);
851 	}
852 
853 	return err;
854 }
855 
856 /*
857  *	IPv4 request_sock destructor.
858  */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
860 {
861 	kfree(inet_rsk(req)->opt);
862 }
863 
864 
865 #ifdef CONFIG_TCP_MD5SIG
866 /*
867  * RFC2385 MD5 checksumming requires a mapping of
868  * IP address->MD5 Key.
869  * We need to maintain these in the sk structure.
870  */
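/* For reference (see tcp_v4_md5_hash_skb() below): the RFC 2385 digest is
 * computed, in order, over the IPv4 pseudo-header, the TCP header with its
 * checksum field zeroed, the TCP payload, and finally the key itself.
 */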
871 
872 /* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
874 					 const union tcp_md5_addr *addr,
875 					 int family)
876 {
877 	const struct tcp_sock *tp = tcp_sk(sk);
878 	struct tcp_md5sig_key *key;
879 	unsigned int size = sizeof(struct in_addr);
880 	const struct tcp_md5sig_info *md5sig;
881 
882 	/* caller either holds rcu_read_lock() or socket lock */
883 	md5sig = rcu_dereference_check(tp->md5sig_info,
884 				       sock_owned_by_user(sk) ||
885 				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
886 	if (!md5sig)
887 		return NULL;
888 #if IS_ENABLED(CONFIG_IPV6)
889 	if (family == AF_INET6)
890 		size = sizeof(struct in6_addr);
891 #endif
892 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
893 		if (key->family != family)
894 			continue;
895 		if (!memcmp(&key->addr, addr, size))
896 			return key;
897 	}
898 	return NULL;
899 }
900 EXPORT_SYMBOL(tcp_md5_do_lookup);
901 
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
903 					 const struct sock *addr_sk)
904 {
905 	const union tcp_md5_addr *addr;
906 
907 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
908 	return tcp_md5_do_lookup(sk, addr, AF_INET);
909 }
910 EXPORT_SYMBOL(tcp_v4_md5_lookup);
911 
912 /* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
914 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
915 {
916 	/* Add Key to the list */
917 	struct tcp_md5sig_key *key;
918 	struct tcp_sock *tp = tcp_sk(sk);
919 	struct tcp_md5sig_info *md5sig;
920 
921 	key = tcp_md5_do_lookup(sk, addr, family);
922 	if (key) {
923 		/* Pre-existing entry - just update that one. */
924 		memcpy(key->key, newkey, newkeylen);
925 		key->keylen = newkeylen;
926 		return 0;
927 	}
928 
929 	md5sig = rcu_dereference_protected(tp->md5sig_info,
930 					   sock_owned_by_user(sk) ||
931 					   lockdep_is_held(&sk->sk_lock.slock));
932 	if (!md5sig) {
933 		md5sig = kmalloc(sizeof(*md5sig), gfp);
934 		if (!md5sig)
935 			return -ENOMEM;
936 
937 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
938 		INIT_HLIST_HEAD(&md5sig->head);
939 		rcu_assign_pointer(tp->md5sig_info, md5sig);
940 	}
941 
942 	key = sock_kmalloc(sk, sizeof(*key), gfp);
943 	if (!key)
944 		return -ENOMEM;
945 	if (!tcp_alloc_md5sig_pool()) {
946 		sock_kfree_s(sk, key, sizeof(*key));
947 		return -ENOMEM;
948 	}
949 
950 	memcpy(key->key, newkey, newkeylen);
951 	key->keylen = newkeylen;
952 	key->family = family;
953 	memcpy(&key->addr, addr,
954 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
955 				      sizeof(struct in_addr));
956 	hlist_add_head_rcu(&key->node, &md5sig->head);
957 	return 0;
958 }
959 EXPORT_SYMBOL(tcp_md5_do_add);
960 
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
962 {
963 	struct tcp_md5sig_key *key;
964 
965 	key = tcp_md5_do_lookup(sk, addr, family);
966 	if (!key)
967 		return -ENOENT;
968 	hlist_del_rcu(&key->node);
969 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
970 	kfree_rcu(key, rcu);
971 	return 0;
972 }
973 EXPORT_SYMBOL(tcp_md5_do_del);
974 
static void tcp_clear_md5_list(struct sock *sk)
976 {
977 	struct tcp_sock *tp = tcp_sk(sk);
978 	struct tcp_md5sig_key *key;
979 	struct hlist_node *n;
980 	struct tcp_md5sig_info *md5sig;
981 
982 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
983 
984 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
985 		hlist_del_rcu(&key->node);
986 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
987 		kfree_rcu(key, rcu);
988 	}
989 }
990 
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
992 				 int optlen)
993 {
994 	struct tcp_md5sig cmd;
995 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
996 
997 	if (optlen < sizeof(cmd))
998 		return -EINVAL;
999 
1000 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1001 		return -EFAULT;
1002 
1003 	if (sin->sin_family != AF_INET)
1004 		return -EINVAL;
1005 
1006 	if (!cmd.tcpm_keylen)
1007 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1008 				      AF_INET);
1009 
1010 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1011 		return -EINVAL;
1012 
1013 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1014 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1015 			      GFP_KERNEL);
1016 }
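/* Minimal userspace sketch of how this setsockopt is exercised (illustrative
 * only, error handling omitted; 192.0.2.1 is a documentation address):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that peer, matching the
 * tcp_md5_do_del() path above.
 */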
1017 
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1019 					__be32 daddr, __be32 saddr, int nbytes)
1020 {
1021 	struct tcp4_pseudohdr *bp;
1022 	struct scatterlist sg;
1023 
1024 	bp = &hp->md5_blk.ip4;
1025 
1026 	/*
1027 	 * 1. the TCP pseudo-header (in the order: source IP address,
1028 	 * destination IP address, zero-padded protocol number, and
1029 	 * segment length)
1030 	 */
1031 	bp->saddr = saddr;
1032 	bp->daddr = daddr;
1033 	bp->pad = 0;
1034 	bp->protocol = IPPROTO_TCP;
1035 	bp->len = cpu_to_be16(nbytes);
1036 
1037 	sg_init_one(&sg, bp, sizeof(*bp));
1038 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1039 }
1040 
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1042 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1043 {
1044 	struct tcp_md5sig_pool *hp;
1045 	struct hash_desc *desc;
1046 
1047 	hp = tcp_get_md5sig_pool();
1048 	if (!hp)
1049 		goto clear_hash_noput;
1050 	desc = &hp->md5_desc;
1051 
1052 	if (crypto_hash_init(desc))
1053 		goto clear_hash;
1054 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1055 		goto clear_hash;
1056 	if (tcp_md5_hash_header(hp, th))
1057 		goto clear_hash;
1058 	if (tcp_md5_hash_key(hp, key))
1059 		goto clear_hash;
1060 	if (crypto_hash_final(desc, md5_hash))
1061 		goto clear_hash;
1062 
1063 	tcp_put_md5sig_pool();
1064 	return 0;
1065 
1066 clear_hash:
1067 	tcp_put_md5sig_pool();
1068 clear_hash_noput:
1069 	memset(md5_hash, 0, 16);
1070 	return 1;
1071 }
1072 
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1074 			const struct sock *sk,
1075 			const struct sk_buff *skb)
1076 {
1077 	struct tcp_md5sig_pool *hp;
1078 	struct hash_desc *desc;
1079 	const struct tcphdr *th = tcp_hdr(skb);
1080 	__be32 saddr, daddr;
1081 
1082 	if (sk) { /* valid for establish/request sockets */
1083 		saddr = sk->sk_rcv_saddr;
1084 		daddr = sk->sk_daddr;
1085 	} else {
1086 		const struct iphdr *iph = ip_hdr(skb);
1087 		saddr = iph->saddr;
1088 		daddr = iph->daddr;
1089 	}
1090 
1091 	hp = tcp_get_md5sig_pool();
1092 	if (!hp)
1093 		goto clear_hash_noput;
1094 	desc = &hp->md5_desc;
1095 
1096 	if (crypto_hash_init(desc))
1097 		goto clear_hash;
1098 
1099 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1100 		goto clear_hash;
1101 	if (tcp_md5_hash_header(hp, th))
1102 		goto clear_hash;
1103 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1104 		goto clear_hash;
1105 	if (tcp_md5_hash_key(hp, key))
1106 		goto clear_hash;
1107 	if (crypto_hash_final(desc, md5_hash))
1108 		goto clear_hash;
1109 
1110 	tcp_put_md5sig_pool();
1111 	return 0;
1112 
1113 clear_hash:
1114 	tcp_put_md5sig_pool();
1115 clear_hash_noput:
1116 	memset(md5_hash, 0, 16);
1117 	return 1;
1118 }
1119 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1120 
1121 #endif
1122 
1123 /* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1125 				    const struct sk_buff *skb)
1126 {
1127 #ifdef CONFIG_TCP_MD5SIG
1128 	/*
1129 	 * This gets called for each TCP segment that arrives
1130 	 * so we want to be efficient.
1131 	 * We have 3 drop cases:
1132 	 * o No MD5 hash and one expected.
1133 	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
1135 	 */
1136 	const __u8 *hash_location = NULL;
1137 	struct tcp_md5sig_key *hash_expected;
1138 	const struct iphdr *iph = ip_hdr(skb);
1139 	const struct tcphdr *th = tcp_hdr(skb);
1140 	int genhash;
1141 	unsigned char newhash[16];
1142 
1143 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1144 					  AF_INET);
1145 	hash_location = tcp_parse_md5sig_option(th);
1146 
1147 	/* We've parsed the options - do we have a hash? */
1148 	if (!hash_expected && !hash_location)
1149 		return false;
1150 
1151 	if (hash_expected && !hash_location) {
1152 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1153 		return true;
1154 	}
1155 
1156 	if (!hash_expected && hash_location) {
1157 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1158 		return true;
1159 	}
1160 
1161 	/* Okay, so this is hash_expected and hash_location -
1162 	 * so we need to calculate the checksum.
1163 	 */
1164 	genhash = tcp_v4_md5_hash_skb(newhash,
1165 				      hash_expected,
1166 				      NULL, skb);
1167 
1168 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1169 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1170 				     &iph->saddr, ntohs(th->source),
1171 				     &iph->daddr, ntohs(th->dest),
1172 				     genhash ? " tcp_v4_calc_md5_hash failed"
1173 				     : "");
1174 		return true;
1175 	}
1176 	return false;
1177 #endif
1178 	return false;
1179 }
1180 
static void tcp_v4_init_req(struct request_sock *req,
1182 			    const struct sock *sk_listener,
1183 			    struct sk_buff *skb)
1184 {
1185 	struct inet_request_sock *ireq = inet_rsk(req);
1186 
1187 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1188 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1189 	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1190 	ireq->opt = tcp_v4_save_options(skb);
1191 }
1192 
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1194 					  struct flowi *fl,
1195 					  const struct request_sock *req,
1196 					  bool *strict)
1197 {
1198 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1199 
1200 	if (strict) {
1201 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1202 			*strict = true;
1203 		else
1204 			*strict = false;
1205 	}
1206 
1207 	return dst;
1208 }
1209 
1210 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1211 	.family		=	PF_INET,
1212 	.obj_size	=	sizeof(struct tcp_request_sock),
1213 	.rtx_syn_ack	=	tcp_rtx_synack,
1214 	.send_ack	=	tcp_v4_reqsk_send_ack,
1215 	.destructor	=	tcp_v4_reqsk_destructor,
1216 	.send_reset	=	tcp_v4_send_reset,
1217 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1218 };
1219 
1220 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1221 	.mss_clamp	=	TCP_MSS_DEFAULT,
1222 #ifdef CONFIG_TCP_MD5SIG
1223 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1224 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1225 #endif
1226 	.init_req	=	tcp_v4_init_req,
1227 #ifdef CONFIG_SYN_COOKIES
1228 	.cookie_init_seq =	cookie_v4_init_sequence,
1229 #endif
1230 	.route_req	=	tcp_v4_route_req,
1231 	.init_seq	=	tcp_v4_init_sequence,
1232 	.send_synack	=	tcp_v4_send_synack,
1233 };
1234 
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1236 {
	/* Never answer SYNs sent to broadcast or multicast addresses */
1238 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1239 		goto drop;
1240 
1241 	return tcp_conn_request(&tcp_request_sock_ops,
1242 				&tcp_request_sock_ipv4_ops, sk, skb);
1243 
1244 drop:
1245 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1246 	return 0;
1247 }
1248 EXPORT_SYMBOL(tcp_v4_conn_request);
1249 
1250 
1251 /*
1252  * The three way handshake has completed - we got a valid synack -
1253  * now create the new socket.
1254  */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1256 				  struct request_sock *req,
1257 				  struct dst_entry *dst,
1258 				  struct request_sock *req_unhash,
1259 				  bool *own_req)
1260 {
1261 	struct inet_request_sock *ireq;
1262 	struct inet_sock *newinet;
1263 	struct tcp_sock *newtp;
1264 	struct sock *newsk;
1265 #ifdef CONFIG_TCP_MD5SIG
1266 	struct tcp_md5sig_key *key;
1267 #endif
1268 	struct ip_options_rcu *inet_opt;
1269 
1270 	if (sk_acceptq_is_full(sk))
1271 		goto exit_overflow;
1272 
1273 	newsk = tcp_create_openreq_child(sk, req, skb);
1274 	if (!newsk)
1275 		goto exit_nonewsk;
1276 
1277 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1278 	inet_sk_rx_dst_set(newsk, skb);
1279 
1280 	newtp		      = tcp_sk(newsk);
1281 	newinet		      = inet_sk(newsk);
1282 	ireq		      = inet_rsk(req);
1283 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1284 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1285 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1286 	inet_opt	      = ireq->opt;
1287 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1288 	ireq->opt	      = NULL;
1289 	newinet->mc_index     = inet_iif(skb);
1290 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1291 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1292 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1293 	if (inet_opt)
1294 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1295 	newinet->inet_id = newtp->write_seq ^ jiffies;
1296 
1297 	if (!dst) {
1298 		dst = inet_csk_route_child_sock(sk, newsk, req);
1299 		if (!dst)
1300 			goto put_and_exit;
1301 	} else {
1302 		/* syncookie case : see end of cookie_v4_check() */
1303 	}
1304 	sk_setup_caps(newsk, dst);
1305 
1306 	tcp_ca_openreq_child(newsk, dst);
1307 
1308 	tcp_sync_mss(newsk, dst_mtu(dst));
1309 	newtp->advmss = dst_metric_advmss(dst);
1310 	if (tcp_sk(sk)->rx_opt.user_mss &&
1311 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1312 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1313 
1314 	tcp_initialize_rcv_mss(newsk);
1315 
1316 #ifdef CONFIG_TCP_MD5SIG
1317 	/* Copy over the MD5 key from the original socket */
1318 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1319 				AF_INET);
1320 	if (key) {
1321 		/*
1322 		 * We're using one, so create a matching key
1323 		 * on the newsk structure. If we fail to get
1324 		 * memory, then we end up not copying the key
1325 		 * across. Shucks.
1326 		 */
1327 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1328 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1329 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1330 	}
1331 #endif
1332 
1333 	if (__inet_inherit_port(sk, newsk) < 0)
1334 		goto put_and_exit;
1335 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1336 	if (*own_req)
1337 		tcp_move_syn(newtp, req);
1338 
1339 	return newsk;
1340 
1341 exit_overflow:
1342 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1343 exit_nonewsk:
1344 	dst_release(dst);
1345 exit:
1346 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1347 	return NULL;
1348 put_and_exit:
1349 	inet_csk_prepare_forced_close(newsk);
1350 	tcp_done(newsk);
1351 	goto exit;
1352 }
1353 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1354 
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1356 {
1357 #ifdef CONFIG_SYN_COOKIES
1358 	const struct tcphdr *th = tcp_hdr(skb);
1359 
1360 	if (!th->syn)
1361 		sk = cookie_v4_check(sk, skb);
1362 #endif
1363 	return sk;
1364 }
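/* Roughly: on a listening socket a bare ACK (no SYN bit) may be the final step
 * of a handshake whose request_sock was never kept because the SYN queue
 * overflowed; cookie_v4_check() then tries to reconstruct the request from the
 * syncookie encoded in the ACK's sequence number.
 */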
1365 
/* The socket must have its spinlock held when we get
1367  * here, unless it is a TCP_LISTEN socket.
1368  *
1369  * We have a potential double-lock case here, so even when
1370  * doing backlog processing we use the BH locking scheme.
1371  * This is because we cannot sleep with the original spinlock
1372  * held.
1373  */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1375 {
1376 	struct sock *rsk;
1377 
1378 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1379 		struct dst_entry *dst = sk->sk_rx_dst;
1380 
1381 		sock_rps_save_rxhash(sk, skb);
1382 		sk_mark_napi_id(sk, skb);
1383 		if (dst) {
1384 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1385 			    !dst->ops->check(dst, 0)) {
1386 				dst_release(dst);
1387 				sk->sk_rx_dst = NULL;
1388 			}
1389 		}
1390 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1391 		return 0;
1392 	}
1393 
1394 	if (tcp_checksum_complete(skb))
1395 		goto csum_err;
1396 
1397 	if (sk->sk_state == TCP_LISTEN) {
1398 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1399 
1400 		if (!nsk)
1401 			goto discard;
1402 		if (nsk != sk) {
1403 			sock_rps_save_rxhash(nsk, skb);
1404 			sk_mark_napi_id(nsk, skb);
1405 			if (tcp_child_process(sk, nsk, skb)) {
1406 				rsk = nsk;
1407 				goto reset;
1408 			}
1409 			return 0;
1410 		}
1411 	} else
1412 		sock_rps_save_rxhash(sk, skb);
1413 
1414 	if (tcp_rcv_state_process(sk, skb)) {
1415 		rsk = sk;
1416 		goto reset;
1417 	}
1418 	return 0;
1419 
1420 reset:
1421 	tcp_v4_send_reset(rsk, skb);
1422 discard:
1423 	kfree_skb(skb);
1424 	/* Be careful here. If this function gets more complicated and
1425 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1426 	 * might be destroyed here. This current version compiles correctly,
1427 	 * but you have been warned.
1428 	 */
1429 	return 0;
1430 
1431 csum_err:
1432 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1433 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1434 	goto discard;
1435 }
1436 EXPORT_SYMBOL(tcp_v4_do_rcv);
1437 
void tcp_v4_early_demux(struct sk_buff *skb)
1439 {
1440 	const struct iphdr *iph;
1441 	const struct tcphdr *th;
1442 	struct sock *sk;
1443 
1444 	if (skb->pkt_type != PACKET_HOST)
1445 		return;
1446 
1447 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1448 		return;
1449 
1450 	iph = ip_hdr(skb);
1451 	th = tcp_hdr(skb);
1452 
1453 	if (th->doff < sizeof(struct tcphdr) / 4)
1454 		return;
1455 
1456 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1457 				       iph->saddr, th->source,
1458 				       iph->daddr, ntohs(th->dest),
1459 				       skb->skb_iif);
1460 	if (sk) {
1461 		skb->sk = sk;
1462 		skb->destructor = sock_edemux;
1463 		if (sk_fullsock(sk)) {
1464 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1465 
1466 			if (dst)
1467 				dst = dst_check(dst, 0);
1468 			if (dst &&
1469 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1470 				skb_dst_set_noref(skb, dst);
1471 		}
1472 	}
1473 }
1474 
1475 /* Packet is added to VJ-style prequeue for processing in process
1476  * context, if a reader task is waiting. Apparently, this exciting
1477  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1478  * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see why it failed. 8)8)				  --ANK
1480  *
1481  */
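/* Roughly: instead of processing the segment in softirq context, queue it on
 * tp->ucopy.prequeue so that the reader blocked in tcp_recvmsg() handles it in
 * process context; if the prequeue grows beyond sk_rcvbuf it is flushed
 * through sk_backlog_rcv() right here.
 */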
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1483 {
1484 	struct tcp_sock *tp = tcp_sk(sk);
1485 
1486 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1487 		return false;
1488 
1489 	if (skb->len <= tcp_hdrlen(skb) &&
1490 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1491 		return false;
1492 
	/* Before escaping the RCU protected region, we need to take care of
	 * the skb dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
	 * Instead of doing a full sk_rx_dst validity check here, let's perform
	 * an optimistic check.
	 */
1499 	if (likely(sk->sk_rx_dst))
1500 		skb_dst_drop(skb);
1501 	else
1502 		skb_dst_force_safe(skb);
1503 
1504 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1505 	tp->ucopy.memory += skb->truesize;
1506 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1507 		struct sk_buff *skb1;
1508 
1509 		BUG_ON(sock_owned_by_user(sk));
1510 
1511 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1512 			sk_backlog_rcv(sk, skb1);
1513 			NET_INC_STATS_BH(sock_net(sk),
1514 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1515 		}
1516 
1517 		tp->ucopy.memory = 0;
1518 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1519 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1520 					   POLLIN | POLLRDNORM | POLLRDBAND);
1521 		if (!inet_csk_ack_scheduled(sk))
1522 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1523 						  (3 * tcp_rto_min(sk)) / 4,
1524 						  TCP_RTO_MAX);
1525 	}
1526 	return true;
1527 }
1528 EXPORT_SYMBOL(tcp_prequeue);
1529 
1530 /*
1531  *	From tcp_input.c
1532  */
1533 
int tcp_v4_rcv(struct sk_buff *skb)
1535 {
1536 	const struct iphdr *iph;
1537 	const struct tcphdr *th;
1538 	struct sock *sk;
1539 	int ret;
1540 	struct net *net = dev_net(skb->dev);
1541 
1542 	if (skb->pkt_type != PACKET_HOST)
1543 		goto discard_it;
1544 
1545 	/* Count it even if it's bad */
1546 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1547 
1548 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1549 		goto discard_it;
1550 
1551 	th = tcp_hdr(skb);
1552 
1553 	if (th->doff < sizeof(struct tcphdr) / 4)
1554 		goto bad_packet;
1555 	if (!pskb_may_pull(skb, th->doff * 4))
1556 		goto discard_it;
1557 
1558 	/* An explanation is required here, I think.
1559 	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
1561 	 * So, we defer the checks. */
1562 
1563 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1564 		goto csum_error;
1565 
1566 	th = tcp_hdr(skb);
1567 	iph = ip_hdr(skb);
	/* This is tricky: we move the IPCB to its correct location inside
	 * TCP_SKB_CB(); barrier() makes sure the compiler won't play
	 * fool^W aliasing games.
	 */
1571 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1572 		sizeof(struct inet_skb_parm));
1573 	barrier();
1574 
1575 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1576 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1577 				    skb->len - th->doff * 4);
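	/* Note: SYN and FIN each consume one unit of sequence space, which is
	 * why they are added to the payload length (skb->len minus the TCP
	 * header) when computing end_seq.
	 */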
1578 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1579 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1580 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1581 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1582 	TCP_SKB_CB(skb)->sacked	 = 0;
1583 
1584 lookup:
1585 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1586 	if (!sk)
1587 		goto no_tcp_socket;
1588 
1589 process:
1590 	if (sk->sk_state == TCP_TIME_WAIT)
1591 		goto do_time_wait;
1592 
1593 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1594 		struct request_sock *req = inet_reqsk(sk);
1595 		struct sock *nsk;
1596 
1597 		sk = req->rsk_listener;
1598 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1599 			reqsk_put(req);
1600 			goto discard_it;
1601 		}
1602 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1603 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1604 			goto lookup;
1605 		}
1606 		sock_hold(sk);
1607 		nsk = tcp_check_req(sk, skb, req, false);
1608 		if (!nsk) {
1609 			reqsk_put(req);
1610 			goto discard_and_relse;
1611 		}
1612 		if (nsk == sk) {
1613 			reqsk_put(req);
1614 		} else if (tcp_child_process(sk, nsk, skb)) {
1615 			tcp_v4_send_reset(nsk, skb);
1616 			goto discard_and_relse;
1617 		} else {
1618 			sock_put(sk);
1619 			return 0;
1620 		}
1621 	}
1622 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1623 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1624 		goto discard_and_relse;
1625 	}
1626 
1627 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1628 		goto discard_and_relse;
1629 
1630 	if (tcp_v4_inbound_md5_hash(sk, skb))
1631 		goto discard_and_relse;
1632 
1633 	nf_reset(skb);
1634 
1635 	if (sk_filter(sk, skb))
1636 		goto discard_and_relse;
1637 
1638 	skb->dev = NULL;
1639 
1640 	if (sk->sk_state == TCP_LISTEN) {
1641 		ret = tcp_v4_do_rcv(sk, skb);
1642 		goto put_and_return;
1643 	}
1644 
1645 	sk_incoming_cpu_update(sk);
1646 
1647 	bh_lock_sock_nested(sk);
1648 	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1649 	ret = 0;
1650 	if (!sock_owned_by_user(sk)) {
1651 		if (!tcp_prequeue(sk, skb))
1652 			ret = tcp_v4_do_rcv(sk, skb);
1653 	} else if (unlikely(sk_add_backlog(sk, skb,
1654 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1655 		bh_unlock_sock(sk);
1656 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1657 		goto discard_and_relse;
1658 	}
1659 	bh_unlock_sock(sk);
1660 
1661 put_and_return:
1662 	sock_put(sk);
1663 
1664 	return ret;
1665 
1666 no_tcp_socket:
1667 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1668 		goto discard_it;
1669 
1670 	if (tcp_checksum_complete(skb)) {
1671 csum_error:
1672 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1673 bad_packet:
1674 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1675 	} else {
1676 		tcp_v4_send_reset(NULL, skb);
1677 	}
1678 
1679 discard_it:
1680 	/* Discard frame. */
1681 	kfree_skb(skb);
1682 	return 0;
1683 
1684 discard_and_relse:
1685 	sock_put(sk);
1686 	goto discard_it;
1687 
1688 do_time_wait:
1689 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1690 		inet_twsk_put(inet_twsk(sk));
1691 		goto discard_it;
1692 	}
1693 
1694 	if (tcp_checksum_complete(skb)) {
1695 		inet_twsk_put(inet_twsk(sk));
1696 		goto csum_error;
1697 	}
1698 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1699 	case TCP_TW_SYN: {
1700 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1701 							&tcp_hashinfo,
1702 							iph->saddr, th->source,
1703 							iph->daddr, th->dest,
1704 							inet_iif(skb));
1705 		if (sk2) {
1706 			inet_twsk_deschedule_put(inet_twsk(sk));
1707 			sk = sk2;
1708 			goto process;
1709 		}
1710 		/* Fall through to ACK */
1711 	}
1712 	case TCP_TW_ACK:
1713 		tcp_v4_timewait_ack(sk, skb);
1714 		break;
1715 	case TCP_TW_RST:
1716 		goto no_tcp_socket;
1717 	case TCP_TW_SUCCESS:;
1718 	}
1719 	goto discard_it;
1720 }
1721 
1722 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1723 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1724 	.twsk_unique	= tcp_twsk_unique,
1725 	.twsk_destructor= tcp_twsk_destructor,
1726 };
1727 
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1729 {
1730 	struct dst_entry *dst = skb_dst(skb);
1731 
1732 	if (dst && dst_hold_safe(dst)) {
1733 		sk->sk_rx_dst = dst;
1734 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1735 	}
1736 }
1737 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1738 
1739 const struct inet_connection_sock_af_ops ipv4_specific = {
1740 	.queue_xmit	   = ip_queue_xmit,
1741 	.send_check	   = tcp_v4_send_check,
1742 	.rebuild_header	   = inet_sk_rebuild_header,
1743 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1744 	.conn_request	   = tcp_v4_conn_request,
1745 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1746 	.net_header_len	   = sizeof(struct iphdr),
1747 	.setsockopt	   = ip_setsockopt,
1748 	.getsockopt	   = ip_getsockopt,
1749 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1750 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1751 	.bind_conflict	   = inet_csk_bind_conflict,
1752 #ifdef CONFIG_COMPAT
1753 	.compat_setsockopt = compat_ip_setsockopt,
1754 	.compat_getsockopt = compat_ip_getsockopt,
1755 #endif
1756 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1757 };
1758 EXPORT_SYMBOL(ipv4_specific);
1759 
1760 #ifdef CONFIG_TCP_MD5SIG
1761 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1762 	.md5_lookup		= tcp_v4_md5_lookup,
1763 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1764 	.md5_parse		= tcp_v4_parse_md5_keys,
1765 };
1766 #endif
1767 
1768 /* NOTE: Many fields are already zeroed explicitly by the call to
1769  *       sk_alloc(), so they need not be initialized here.
1770  */
1771 static int tcp_v4_init_sock(struct sock *sk)
1772 {
1773 	struct inet_connection_sock *icsk = inet_csk(sk);
1774 
1775 	tcp_init_sock(sk);
1776 
1777 	icsk->icsk_af_ops = &ipv4_specific;
1778 
1779 #ifdef CONFIG_TCP_MD5SIG
1780 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1781 #endif
1782 
1783 	return 0;
1784 }
1785 
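/* Protocol-level teardown of an IPv4 TCP socket: stop the transmit timers,
 * release congestion control state, purge the pending queues, drop any MD5
 * keys and release the bound port.
 */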
1786 void tcp_v4_destroy_sock(struct sock *sk)
1787 {
1788 	struct tcp_sock *tp = tcp_sk(sk);
1789 
1790 	tcp_clear_xmit_timers(sk);
1791 
1792 	tcp_cleanup_congestion_control(sk);
1793 
1794 	/* Clean up the write buffer. */
1795 	tcp_write_queue_purge(sk);
1796 
1797 	/* Clean up our, hopefully empty, out_of_order_queue. */
1798 	__skb_queue_purge(&tp->out_of_order_queue);
1799 
1800 #ifdef CONFIG_TCP_MD5SIG
1801 	/* Clean up the MD5 key list, if any */
1802 	if (tp->md5sig_info) {
1803 		tcp_clear_md5_list(sk);
1804 		kfree_rcu(tp->md5sig_info, rcu);
1805 		tp->md5sig_info = NULL;
1806 	}
1807 #endif
1808 
1809 	/* Clean up the prequeue; it really should be empty by now. */
1810 	__skb_queue_purge(&tp->ucopy.prequeue);
1811 
1812 	/* Clean up a referenced TCP bind bucket. */
1813 	if (inet_csk(sk)->icsk_bind_hash)
1814 		inet_put_port(sk);
1815 
1816 	BUG_ON(tp->fastopen_rsk);
1817 
1818 	/* If the socket was aborted during a connect operation, free any pending Fast Open request. */
1819 	tcp_free_fastopen_req(tp);
1820 	tcp_saved_syn_free(tp);
1821 
1822 	sk_sockets_allocated_dec(sk);
1823 	sock_release_memcg(sk);
1824 }
1825 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1826 
1827 #ifdef CONFIG_PROC_FS
1828 /* Proc filesystem TCP sock list dumping. */
1829 
1830 /*
1831  * Get the next listening socket following cur.  If cur is NULL, get the
1832  * first socket, starting from the bucket given in st->bucket; when
1833  * st->bucket is zero the very first socket in the hash table is returned.
1834  */
1835 static void *listening_get_next(struct seq_file *seq, void *cur)
1836 {
1838 	struct hlist_nulls_node *node;
1839 	struct sock *sk = cur;
1840 	struct inet_listen_hashbucket *ilb;
1841 	struct tcp_iter_state *st = seq->private;
1842 	struct net *net = seq_file_net(seq);
1843 
1844 	if (!sk) {
1845 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1846 		spin_lock_bh(&ilb->lock);
1847 		sk = sk_nulls_head(&ilb->head);
1848 		st->offset = 0;
1849 		goto get_sk;
1850 	}
1851 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1852 	++st->num;
1853 	++st->offset;
1854 
1855 	sk = sk_nulls_next(sk);
1856 get_sk:
1857 	sk_nulls_for_each_from(sk, node) {
1858 		if (!net_eq(sock_net(sk), net))
1859 			continue;
1860 		if (sk->sk_family == st->family) {
1861 			cur = sk;
1862 			goto out;
1863 		}
1865 	}
1866 	spin_unlock_bh(&ilb->lock);
1867 	st->offset = 0;
1868 	if (++st->bucket < INET_LHTABLE_SIZE) {
1869 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1870 		spin_lock_bh(&ilb->lock);
1871 		sk = sk_nulls_head(&ilb->head);
1872 		goto get_sk;
1873 	}
1874 	cur = NULL;
1875 out:
1876 	return cur;
1877 }
1878 
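/* Position the listener walk: start from the first listening socket and
 * advance until *pos matching entries have been skipped.
 */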
1879 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1880 {
1881 	struct tcp_iter_state *st = seq->private;
1882 	void *rc;
1883 
1884 	st->bucket = 0;
1885 	st->offset = 0;
1886 	rc = listening_get_next(seq, NULL);
1887 
1888 	while (rc && *pos) {
1889 		rc = listening_get_next(seq, rc);
1890 		--*pos;
1891 	}
1892 	return rc;
1893 }
1894 
1895 static inline bool empty_bucket(const struct tcp_iter_state *st)
1896 {
1897 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1898 }
1899 
1900 /*
1901  * Get first established socket starting from bucket given in st->bucket.
1902  * If st->bucket is zero, the very first socket in the hash is returned.
1903  */
1904 static void *established_get_first(struct seq_file *seq)
1905 {
1906 	struct tcp_iter_state *st = seq->private;
1907 	struct net *net = seq_file_net(seq);
1908 	void *rc = NULL;
1909 
1910 	st->offset = 0;
1911 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1912 		struct sock *sk;
1913 		struct hlist_nulls_node *node;
1914 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1915 
1916 		/* Lockless fast path for the common case of empty buckets */
1917 		if (empty_bucket(st))
1918 			continue;
1919 
1920 		spin_lock_bh(lock);
1921 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1922 			if (sk->sk_family != st->family ||
1923 			    !net_eq(sock_net(sk), net)) {
1924 				continue;
1925 			}
1926 			rc = sk;
1927 			goto out;
1928 		}
1929 		spin_unlock_bh(lock);
1930 	}
1931 out:
1932 	return rc;
1933 }
1934 
1935 static void *established_get_next(struct seq_file *seq, void *cur)
1936 {
1937 	struct sock *sk = cur;
1938 	struct hlist_nulls_node *node;
1939 	struct tcp_iter_state *st = seq->private;
1940 	struct net *net = seq_file_net(seq);
1941 
1942 	++st->num;
1943 	++st->offset;
1944 
1945 	sk = sk_nulls_next(sk);
1946 
1947 	sk_nulls_for_each_from(sk, node) {
1948 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1949 			return sk;
1950 	}
1951 
1952 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1953 	++st->bucket;
1954 	return established_get_first(seq);
1955 }
1956 
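/* Same idea as listening_get_idx(), but for the established hash: skip
 * 'pos' matching entries starting from bucket zero.
 */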
1957 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1958 {
1959 	struct tcp_iter_state *st = seq->private;
1960 	void *rc;
1961 
1962 	st->bucket = 0;
1963 	rc = established_get_first(seq);
1964 
1965 	while (rc && pos) {
1966 		rc = established_get_next(seq, rc);
1967 		--pos;
1968 	}
1969 	return rc;
1970 }
1971 
1972 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1973 {
1974 	void *rc;
1975 	struct tcp_iter_state *st = seq->private;
1976 
1977 	st->state = TCP_SEQ_STATE_LISTENING;
1978 	rc	  = listening_get_idx(seq, &pos);
1979 
1980 	if (!rc) {
1981 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1982 		rc	  = established_get_idx(seq, pos);
1983 	}
1984 
1985 	return rc;
1986 }
1987 
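/* Fast-forward to the bucket and in-bucket offset remembered from the
 * previous read, so a sequential read of the file does not rescan the
 * whole hash table on every syscall.
 */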
1988 static void *tcp_seek_last_pos(struct seq_file *seq)
1989 {
1990 	struct tcp_iter_state *st = seq->private;
1991 	int offset = st->offset;
1992 	int orig_num = st->num;
1993 	void *rc = NULL;
1994 
1995 	switch (st->state) {
1996 	case TCP_SEQ_STATE_LISTENING:
1997 		if (st->bucket >= INET_LHTABLE_SIZE)
1998 			break;
1999 		st->state = TCP_SEQ_STATE_LISTENING;
2000 		rc = listening_get_next(seq, NULL);
2001 		while (offset-- && rc)
2002 			rc = listening_get_next(seq, rc);
2003 		if (rc)
2004 			break;
2005 		st->bucket = 0;
2006 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2007 		/* Fallthrough */
2008 	case TCP_SEQ_STATE_ESTABLISHED:
2009 		if (st->bucket > tcp_hashinfo.ehash_mask)
2010 			break;
2011 		rc = established_get_first(seq);
2012 		while (offset-- && rc)
2013 			rc = established_get_next(seq, rc);
2014 	}
2015 
2016 	st->num = orig_num;
2017 
2018 	return rc;
2019 }
2020 
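/* seq_file start callback: reuse the cached position when the reader
 * continues where it left off, otherwise restart the walk from scratch.
 */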
2021 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2022 {
2023 	struct tcp_iter_state *st = seq->private;
2024 	void *rc;
2025 
2026 	if (*pos && *pos == st->last_pos) {
2027 		rc = tcp_seek_last_pos(seq);
2028 		if (rc)
2029 			goto out;
2030 	}
2031 
2032 	st->state = TCP_SEQ_STATE_LISTENING;
2033 	st->num = 0;
2034 	st->bucket = 0;
2035 	st->offset = 0;
2036 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2037 
2038 out:
2039 	st->last_pos = *pos;
2040 	return rc;
2041 }
2042 
2043 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2044 {
2045 	struct tcp_iter_state *st = seq->private;
2046 	void *rc = NULL;
2047 
2048 	if (v == SEQ_START_TOKEN) {
2049 		rc = tcp_get_idx(seq, 0);
2050 		goto out;
2051 	}
2052 
2053 	switch (st->state) {
2054 	case TCP_SEQ_STATE_LISTENING:
2055 		rc = listening_get_next(seq, v);
2056 		if (!rc) {
2057 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2058 			st->bucket = 0;
2059 			st->offset = 0;
2060 			rc	  = established_get_first(seq);
2061 		}
2062 		break;
2063 	case TCP_SEQ_STATE_ESTABLISHED:
2064 		rc = established_get_next(seq, v);
2065 		break;
2066 	}
2067 out:
2068 	++*pos;
2069 	st->last_pos = *pos;
2070 	return rc;
2071 }
2072 
2073 static void tcp_seq_stop(struct seq_file *seq, void *v)
2074 {
2075 	struct tcp_iter_state *st = seq->private;
2076 
2077 	switch (st->state) {
2078 	case TCP_SEQ_STATE_LISTENING:
2079 		if (v != SEQ_START_TOKEN)
2080 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2081 		break;
2082 	case TCP_SEQ_STATE_ESTABLISHED:
2083 		if (v)
2084 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2085 		break;
2086 	}
2087 }
2088 
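/* seq_file open hook shared by the IPv4 and IPv6 /proc entries: attach a
 * namespace-aware iterator and record the address family to filter on.
 */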
2089 int tcp_seq_open(struct inode *inode, struct file *file)
2090 {
2091 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2092 	struct tcp_iter_state *s;
2093 	int err;
2094 
2095 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2096 			  sizeof(struct tcp_iter_state));
2097 	if (err < 0)
2098 		return err;
2099 
2100 	s = ((struct seq_file *)file->private_data)->private;
2101 	s->family		= afinfo->family;
2102 	s->last_pos		= 0;
2103 	return 0;
2104 }
2105 EXPORT_SYMBOL(tcp_seq_open);
2106 
2107 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2108 {
2109 	int rc = 0;
2110 	struct proc_dir_entry *p;
2111 
2112 	afinfo->seq_ops.start		= tcp_seq_start;
2113 	afinfo->seq_ops.next		= tcp_seq_next;
2114 	afinfo->seq_ops.stop		= tcp_seq_stop;
2115 
2116 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2117 			     afinfo->seq_fops, afinfo);
2118 	if (!p)
2119 		rc = -ENOMEM;
2120 	return rc;
2121 }
2122 EXPORT_SYMBOL(tcp_proc_register);
2123 
2124 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2125 {
2126 	remove_proc_entry(afinfo->name, net->proc_net);
2127 }
2128 EXPORT_SYMBOL(tcp_proc_unregister);
2129 
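/* Format one SYN_RECV request sock as a /proc/net/tcp line.  Several
 * columns (queue sizes, inode) are printed as constants because a request
 * sock does not yet have a full socket behind it.
 */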
2130 static void get_openreq4(const struct request_sock *req,
2131 			 struct seq_file *f, int i)
2132 {
2133 	const struct inet_request_sock *ireq = inet_rsk(req);
2134 	long delta = req->rsk_timer.expires - jiffies;
2135 
2136 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2137 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2138 		i,
2139 		ireq->ir_loc_addr,
2140 		ireq->ir_num,
2141 		ireq->ir_rmt_addr,
2142 		ntohs(ireq->ir_rmt_port),
2143 		TCP_SYN_RECV,
2144 		0, 0, /* could print option size, but that is af dependent. */
2145 		1,    /* timers active (only the expire timer) */
2146 		jiffies_delta_to_clock_t(delta),
2147 		req->num_timeout,
2148 		from_kuid_munged(seq_user_ns(f),
2149 				 sock_i_uid(req->rsk_listener)),
2150 		0,  /* non standard timer */
2151 		0, /* open_requests have no inode */
2152 		0,
2153 		req);
2154 }
2155 
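/* Format one full TCP socket as a /proc/net/tcp line.  The "tr" column
 * encodes the pending timer: 0 none, 1 retransmit/early-retransmit/loss
 * probe, 2 keepalive (sk_timer), 4 zero-window probe; 3 is used only for
 * TIME_WAIT entries (see get_timewait4_sock()).
 */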
2156 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2157 {
2158 	int timer_active;
2159 	unsigned long timer_expires;
2160 	const struct tcp_sock *tp = tcp_sk(sk);
2161 	const struct inet_connection_sock *icsk = inet_csk(sk);
2162 	const struct inet_sock *inet = inet_sk(sk);
2163 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2164 	__be32 dest = inet->inet_daddr;
2165 	__be32 src = inet->inet_rcv_saddr;
2166 	__u16 destp = ntohs(inet->inet_dport);
2167 	__u16 srcp = ntohs(inet->inet_sport);
2168 	int rx_queue;
2169 	int state;
2170 
2171 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2172 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2173 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2174 		timer_active	= 1;
2175 		timer_expires	= icsk->icsk_timeout;
2176 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2177 		timer_active	= 4;
2178 		timer_expires	= icsk->icsk_timeout;
2179 	} else if (timer_pending(&sk->sk_timer)) {
2180 		timer_active	= 2;
2181 		timer_expires	= sk->sk_timer.expires;
2182 	} else {
2183 		timer_active	= 0;
2184 		timer_expires = jiffies;
2185 	}
2186 
2187 	state = sk_state_load(sk);
2188 	if (state == TCP_LISTEN)
2189 		rx_queue = sk->sk_ack_backlog;
2190 	else
2191 		/* Because we don't lock the socket,
2192 		 * we might find a transient negative value.
2193 		 */
2194 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2195 
2196 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2197 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2198 		i, src, srcp, dest, destp, state,
2199 		tp->write_seq - tp->snd_una,
2200 		rx_queue,
2201 		timer_active,
2202 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2203 		icsk->icsk_retransmits,
2204 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2205 		icsk->icsk_probes_out,
2206 		sock_i_ino(sk),
2207 		atomic_read(&sk->sk_refcnt), sk,
2208 		jiffies_to_clock_t(icsk->icsk_rto),
2209 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2210 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2211 		tp->snd_cwnd,
2212 		state == TCP_LISTEN ?
2213 		    fastopenq->max_qlen :
2214 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2215 }
2216 
2217 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2218 			       struct seq_file *f, int i)
2219 {
2220 	long delta = tw->tw_timer.expires - jiffies;
2221 	__be32 dest, src;
2222 	__u16 destp, srcp;
2223 
2224 	dest  = tw->tw_daddr;
2225 	src   = tw->tw_rcv_saddr;
2226 	destp = ntohs(tw->tw_dport);
2227 	srcp  = ntohs(tw->tw_sport);
2228 
2229 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2230 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2231 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2232 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2233 		atomic_read(&tw->tw_refcnt), tw);
2234 }
2235 
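/* Every record in /proc/net/tcp is padded to TMPSZ - 1 characters plus the
 * trailing newline, so readers always see constant-length lines.
 */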
2236 #define TMPSZ 150
2237 
2238 static int tcp4_seq_show(struct seq_file *seq, void *v)
2239 {
2240 	struct tcp_iter_state *st;
2241 	struct sock *sk = v;
2242 
2243 	seq_setwidth(seq, TMPSZ - 1);
2244 	if (v == SEQ_START_TOKEN) {
2245 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2246 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2247 			   "inode");
2248 		goto out;
2249 	}
2250 	st = seq->private;
2251 
2252 	if (sk->sk_state == TCP_TIME_WAIT)
2253 		get_timewait4_sock(v, seq, st->num);
2254 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2255 		get_openreq4(v, seq, st->num);
2256 	else
2257 		get_tcp4_sock(v, seq, st->num);
2258 out:
2259 	seq_pad(seq, '\n');
2260 	return 0;
2261 }
2262 
2263 static const struct file_operations tcp_afinfo_seq_fops = {
2264 	.owner   = THIS_MODULE,
2265 	.open    = tcp_seq_open,
2266 	.read    = seq_read,
2267 	.llseek  = seq_lseek,
2268 	.release = seq_release_net
2269 };
2270 
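/* Descriptor for the per-namespace /proc/net/tcp file.  The start/next/stop
 * seq_ops are filled in by tcp_proc_register(); only ->show is
 * family-specific.
 */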
2271 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2272 	.name		= "tcp",
2273 	.family		= AF_INET,
2274 	.seq_fops	= &tcp_afinfo_seq_fops,
2275 	.seq_ops	= {
2276 		.show		= tcp4_seq_show,
2277 	},
2278 };
2279 
2280 static int __net_init tcp4_proc_init_net(struct net *net)
2281 {
2282 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2283 }
2284 
2285 static void __net_exit tcp4_proc_exit_net(struct net *net)
2286 {
2287 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2288 }
2289 
2290 static struct pernet_operations tcp4_net_ops = {
2291 	.init = tcp4_proc_init_net,
2292 	.exit = tcp4_proc_exit_net,
2293 };
2294 
2295 int __init tcp4_proc_init(void)
2296 {
2297 	return register_pernet_subsys(&tcp4_net_ops);
2298 }
2299 
2300 void tcp4_proc_exit(void)
2301 {
2302 	unregister_pernet_subsys(&tcp4_net_ops);
2303 }
2304 #endif /* CONFIG_PROC_FS */
2305 
2306 struct proto tcp_prot = {
2307 	.name			= "TCP",
2308 	.owner			= THIS_MODULE,
2309 	.close			= tcp_close,
2310 	.connect		= tcp_v4_connect,
2311 	.disconnect		= tcp_disconnect,
2312 	.accept			= inet_csk_accept,
2313 	.ioctl			= tcp_ioctl,
2314 	.init			= tcp_v4_init_sock,
2315 	.destroy		= tcp_v4_destroy_sock,
2316 	.shutdown		= tcp_shutdown,
2317 	.setsockopt		= tcp_setsockopt,
2318 	.getsockopt		= tcp_getsockopt,
2319 	.recvmsg		= tcp_recvmsg,
2320 	.sendmsg		= tcp_sendmsg,
2321 	.sendpage		= tcp_sendpage,
2322 	.backlog_rcv		= tcp_v4_do_rcv,
2323 	.release_cb		= tcp_release_cb,
2324 	.hash			= inet_hash,
2325 	.unhash			= inet_unhash,
2326 	.get_port		= inet_csk_get_port,
2327 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2328 	.stream_memory_free	= tcp_stream_memory_free,
2329 	.sockets_allocated	= &tcp_sockets_allocated,
2330 	.orphan_count		= &tcp_orphan_count,
2331 	.memory_allocated	= &tcp_memory_allocated,
2332 	.memory_pressure	= &tcp_memory_pressure,
2333 	.sysctl_mem		= sysctl_tcp_mem,
2334 	.sysctl_wmem		= sysctl_tcp_wmem,
2335 	.sysctl_rmem		= sysctl_tcp_rmem,
2336 	.max_header		= MAX_TCP_HEADER,
2337 	.obj_size		= sizeof(struct tcp_sock),
2338 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2339 	.twsk_prot		= &tcp_timewait_sock_ops,
2340 	.rsk_prot		= &tcp_request_sock_ops,
2341 	.h.hashinfo		= &tcp_hashinfo,
2342 	.no_autobind		= true,
2343 #ifdef CONFIG_COMPAT
2344 	.compat_setsockopt	= compat_tcp_setsockopt,
2345 	.compat_getsockopt	= compat_tcp_getsockopt,
2346 #endif
2347 #ifdef CONFIG_MEMCG_KMEM
2348 	.init_cgroup		= tcp_init_cgroup,
2349 	.destroy_cgroup		= tcp_destroy_cgroup,
2350 	.proto_cgroup		= tcp_proto_cgroup,
2351 #endif
2352 };
2353 EXPORT_SYMBOL(tcp_prot);
2354 
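/* Each namespace owns one control socket per possible CPU; they are used to
 * send RSTs and ACKs (e.g. from tcp_v4_send_reset()) when no full socket is
 * available.  tcp_sk_init() creates them and tcp_sk_exit() destroys them.
 */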
2355 static void __net_exit tcp_sk_exit(struct net *net)
2356 {
2357 	int cpu;
2358 
2359 	for_each_possible_cpu(cpu)
2360 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2361 	free_percpu(net->ipv4.tcp_sk);
2362 }
2363 
2364 static int __net_init tcp_sk_init(struct net *net)
2365 {
2366 	int res, cpu;
2367 
2368 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2369 	if (!net->ipv4.tcp_sk)
2370 		return -ENOMEM;
2371 
2372 	for_each_possible_cpu(cpu) {
2373 		struct sock *sk;
2374 
2375 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2376 					   IPPROTO_TCP, net);
2377 		if (res)
2378 			goto fail;
2379 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2380 	}
2381 
2382 	net->ipv4.sysctl_tcp_ecn = 2;
2383 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2384 
2385 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2386 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2387 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2388 
2389 	return 0;
2390 fail:
2391 	tcp_sk_exit(net);
2392 
2393 	return res;
2394 }
2395 
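/* When a batch of namespaces is dismantled, purge any TIME_WAIT sockets
 * they still own so their timers cannot fire after the namespace is gone.
 */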
2396 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2397 {
2398 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2399 }
2400 
2401 static struct pernet_operations __net_initdata tcp_sk_ops = {
2402 	.init	    = tcp_sk_init,
2403 	.exit	    = tcp_sk_exit,
2404 	.exit_batch = tcp_sk_exit_batch,
2405 };
2406 
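/* Boot-time initialisation: set up the global TCP hash tables and register
 * the per-namespace operations that create the control sockets.  Failure
 * here is fatal.
 */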
2407 void __init tcp_v4_init(void)
2408 {
2409 	inet_hashinfo_init(&tcp_hashinfo);
2410 	if (register_pernet_subsys(&tcp_sk_ops))
2411 		panic("Failed to create the TCP control socket.\n");
2412 }
2413