/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

int sysctl_tcp_syncookies __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_syncookies);

int sysctl_tcp_abort_on_overflow __read_mostly;

struct inet_timewait_death_row tcp_death_row = {
	.sysctl_max_tw_buckets = NR_FILE * 2,
	.hashinfo	= &tcp_hashinfo,
};
EXPORT_SYMBOL_GPL(tcp_death_row);

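/* RFC 793-style segment acceptability test: accept a segment that starts
 * exactly at the left window edge, that overlaps [s_win, e_win), or that
 * is a zero-length segment sitting exactly at the right window edge.
 */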
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	if (seq == s_win)
		return true;
	if (after(end_seq, s_win) && before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

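/* Reply to an out-of-window segment seen by a timewait socket: ACK it,
 * unless out-of-window ACKs are currently being rate-limited for this
 * socket, in which case the segment is silently dropped.
 */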
static enum tcp_tw_status
tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
				  const struct sk_buff *skb, int mib_idx)
{
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

	if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
				  &tcptw->tw_last_oow_ack_time)) {
		/* Send ACK. Note that we do not put the bucket;
		 * it will be released by the caller.
		 */
		return TCP_TW_ACK;
	}

	/* We are rate-limiting, so just release the tw sock and drop skb. */
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}

/*
 * * The main purpose of the TIME-WAIT state is to close the connection
 *   gracefully when one of the ends sits in LAST-ACK or CLOSING,
 *   retransmitting its FIN (and, probably, a tail of data) while one or
 *   more of our ACKs are lost.
 * * What is the TIME-WAIT timeout? It is associated with the maximal
 *   packet lifetime in the internet, which leads to the wrong conclusion
 *   that it is set to catch "old duplicate segments" wandering off their
 *   path. That is not quite correct. This timeout is calculated so that
 *   it exceeds the maximal retransmission timeout by enough to tolerate
 *   the loss of one (or more) segments sent by the peer and of our ACKs.
 *   This time may be calculated from the RTO.
 * * When a TIME-WAIT socket receives an RST, it means that the other end
 *   has finally closed and we are allowed to kill TIME-WAIT too.
 * * The second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT with
 *   these semantics, we MUST NOT kill the TIME-WAIT state with RSTs.
 * * If we invented some cleverer way to catch duplicates (e.g. based on
 *   PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
 * When you compare it to the RFCs, please read the section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
 * is _not_ stateless. It means that, strictly speaking, we must
 * spinlock it. I do not want to! Well, the probability of misbehaviour
 * is ridiculously low and, it seems, we could use some mb() tricks
 * to avoid misreading sequence numbers, states etc.  --ANK
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results.
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
			   const struct tcphdr *th)
{
	struct tcp_options_received tmp_opt;
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
	bool paws_reject = false;

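	/* Parse TCP options only if the header actually carries any
	 * (doff > 5 words) and we have a stored timestamp to run the
	 * PAWS check against.
	 */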
	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
		tcp_parse_options(skb, &tmp_opt, 0, NULL);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.rcv_tsecr	-= tcptw->tw_ts_offset;
			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	if (tw->tw_substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Out of window, send ACK */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tcptw->tw_rcv_nxt,
				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
			return tcp_timewait_check_oow_rate_limit(
				tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);

		if (th->rst)
			goto kill;

		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
			goto kill_with_rst;

		/* Dup ACK? */
		if (!th->ack ||
		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			inet_twsk_put(tw);
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrives after the half-duplex
		 * close, reset.
		 */
		if (!th->fin ||
		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
			inet_twsk_deschedule_put(tw);
			return TCP_TW_RST;
		}

		/* FIN arrived, enter true time-wait state. */
		tw->tw_substate	  = TCP_TIME_WAIT;
		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent_stamp = get_seconds();
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
		}

		if (tcp_death_row.sysctl_tw_recycle &&
		    tcptw->tw_ts_recent_stamp &&
		    tcp_tw_remember_stamp(tw))
			inet_twsk_reschedule(tw, tw->tw_timeout);
		else
			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 *	Now real TIME-WAIT state.
	 *
	 *	RFC 1122:
	 *	"When a connection is [...] on TIME-WAIT state [...]
	 *	[a TCP] MAY accept a new SYN from the remote TCP to
	 *	reopen the connection directly, if it:
	 *
	 *	(1)  assigns its initial sequence number for the new
	 *	connection to be larger than the largest sequence
	 *	number it used on the previous connection incarnation,
	 *	and
	 *
	 *	(2)  returns to TIME-WAIT state if the SYN turns out
	 *	to be an old duplicate".
	 */

	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* In-window segment; it may only be a reset or a bare ACK. */

		if (th->rst) {
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 */
			if (sysctl_tcp_rfc1337 == 0) {
kill:
				inet_twsk_deschedule_put(tw);
				return TCP_TW_SUCCESS;
			}
		}
		inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);

		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
			tcptw->tw_ts_recent_stamp = get_seconds();
		}

		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment.

	   All such segments are ACKed immediately.

	   The only exception is a new SYN. We accept it, if it is
	   not an old duplicate and we are not in danger of being killed
	   by delayed old duplicates. The RFC check (that the SYN carries
	   a newer sequence number) works at rates < 40 Mbit/sec.
	   However, if PAWS works, it is reliable and, even more,
	   we may relax the silly seq space cutoff.

	   RED-PEN: we violate the main RFC requirement: if this SYN turns
	   out to be an old duplicate (i.e. we receive an RST in reply to
	   our SYN-ACK), we must return the socket to time-wait state.
	   It is not good, but not fatal yet.
	 */

	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
	     (tmp_opt.saw_tstamp &&
	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
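		/* Per the RFC 1122 quote above, the new connection's ISN
		 * must be larger than anything used by the old incarnation;
		 * tw_snd_nxt + 65535 + 2 leaves a full maximum window of
		 * headroom. Avoid 0, which downstream code takes to mean
		 * "no timewait-supplied ISN".
		 */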
		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
		if (isn == 0)
			isn++;
		TCP_SKB_CB(skb)->tcp_tw_isn = isn;
		return TCP_TW_SYN;
	}

	if (paws_reject)
		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

	if (!th->rst) {
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is an ACKless SYN it may be both an old duplicate
		 * and a new good SYN with a random sequence number < rcv_nxt.
		 * Do not reschedule in the latter case.
		 */
		if (paws_reject || th->ack)
			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);

		return tcp_timewait_check_oow_rate_limit(
			tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
	}
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}
EXPORT_SYMBOL(tcp_timewait_state_process);

/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_sock *tp = tcp_sk(sk);
	struct inet_timewait_sock *tw;
	bool recycle_ok = false;

	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
		recycle_ok = tcp_remember_stamp(sk);

	tw = inet_twsk_alloc(sk, &tcp_death_row, state);

	if (tw) {
		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
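		/* 3.5 * RTO; the minimum TIME-WAIT/FIN-WAIT2 timeout used
		 * below.
		 */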
		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
		struct inet_sock *inet = inet_sk(sk);

		tw->tw_transparent	= inet->transparent;
		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
		tcptw->tw_snd_nxt	= tp->snd_nxt;
		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
		tcptw->tw_ts_offset	= tp->tsoffset;
		tcptw->tw_last_oow_ack_time = 0;

#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == PF_INET6) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			tw->tw_v6_daddr = sk->sk_v6_daddr;
			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
			tw->tw_tclass = np->tclass;
			tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
			tw->tw_ipv6only = sk->sk_ipv6only;
		}
#endif

#ifdef CONFIG_TCP_MD5SIG
		/*
		 * The timewait bucket does not have the key DB from the
		 * sock structure. We just make a quick copy of the
		 * md5 key being used (if indeed we are using one)
		 * so the timewait ack generating code has the key.
		 */
		do {
			struct tcp_md5sig_key *key;
			tcptw->tw_md5_key = NULL;
			key = tp->af_specific->md5_lookup(sk, sk);
			if (key) {
				tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
				if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
					BUG();
			}
		} while (0);
#endif

		/* Ensure the TIME_WAIT timeout fires no sooner than 3.5 * RTO. */
		if (timeo < rto)
			timeo = rto;

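		/* With tw_recycle and a remembered peer timestamp, a short
		 * 3.5 * RTO lifetime is enough; otherwise keep the bucket
		 * for the full TCP_TIMEWAIT_LEN (60 seconds).
		 */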
		if (recycle_ok) {
			tw->tw_timeout = rto;
		} else {
			tw->tw_timeout = TCP_TIMEWAIT_LEN;
			if (state == TCP_TIME_WAIT)
				timeo = TCP_TIMEWAIT_LEN;
		}

		inet_twsk_schedule(tw, timeo);
		/* Linkage updates. */
		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
		inet_twsk_put(tw);
	} else {
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up.  We've got bigger problems than
		 * non-graceful socket closings.
		 */
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
	}

	tcp_update_metrics(sk);
	tcp_done(sk);
}

void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_timewait_sock *twsk = tcp_twsk(sk);

	if (twsk->tw_md5_key)
		kfree_rcu(twsk->tw_md5_key, rcu);
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);

/* Warning: This function is called without sk_listener being locked.
 * Be sure to read socket fields once, as their value could change under us.
 */
void tcp_openreq_init_rwin(struct request_sock *req,
			   const struct sock *sk_listener,
			   const struct dst_entry *dst)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	const struct tcp_sock *tp = tcp_sk(sk_listener);
	u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
	int full_space = tcp_full_space(sk_listener);
	int mss = dst_metric_advmss(dst);
	u32 window_clamp;
	__u8 rcv_wscale;

	if (user_mss && user_mss < mss)
		mss = user_mss;

	window_clamp = READ_ONCE(tp->window_clamp);
	/* Set this up on the first call only: use the listener's window_clamp
	 * if set, otherwise the route's RTAX_WINDOW metric.
	 */
	req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);

	/* limit the window selection if the user enforces a smaller rx buffer */
	if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
	    (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
		req->rsk_window_clamp = full_space;

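	/* Choose the initial advertised window, the clamp and (if the peer
	 * offered it) the receive window scale for this request, taking the
	 * route's initial-rwnd metric into account.
	 */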
	/* tcp_full_space because it is guaranteed to be the first packet */
	tcp_select_initial_window(full_space,
		mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
		&req->rsk_rcv_wnd,
		&req->rsk_window_clamp,
		ireq->wscale_ok,
		&rcv_wscale,
		dst_metric(dst, RTAX_INITRWND));
	ireq->rcv_wscale = rcv_wscale;
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);

static void tcp_ecn_openreq_child(struct tcp_sock *tp,
				  const struct request_sock *req)
{
	tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}

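/* Pick the congestion control module for a freshly created child socket:
 * prefer a CA pinned on the destination route (RTAX_CC_ALGO); otherwise
 * keep a CA the listener explicitly selected via setsockopt (if its module
 * can still be grabbed), else fall back to the system default.
 */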
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
	bool ca_got_dst = false;

	if (ca_key != TCP_CA_UNSPEC) {
		const struct tcp_congestion_ops *ca;

		rcu_read_lock();
		ca = tcp_ca_find_key(ca_key);
		if (likely(ca && try_module_get(ca->owner))) {
			icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
			icsk->icsk_ca_ops = ca;
			ca_got_dst = true;
		}
		rcu_read_unlock();
	}

	/* If no valid choice has been made yet, assign the current system default ca. */
	if (!ca_got_dst &&
	    (!icsk->icsk_ca_setsockopt ||
	     !try_module_get(icsk->icsk_ca_ops->owner)))
		tcp_assign_congestion_control(sk);

	tcp_set_ca_state(sk, TCP_CA_Open);
}
EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could avoid lots of memory writes here. tp of the listening
 * socket contains all the necessary default parameters.
 */
struct sock *tcp_create_openreq_child(const struct sock *sk,
				      struct request_sock *req,
				      struct sk_buff *skb)
{
	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);

	if (newsk) {
		const struct inet_request_sock *ireq = inet_rsk(req);
		struct tcp_request_sock *treq = tcp_rsk(req);
		struct inet_connection_sock *newicsk = inet_csk(newsk);
		struct tcp_sock *newtp = tcp_sk(newsk);

		/* Now setup tcp_sock */
		newtp->pred_flags = 0;

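		/* Both directions start one past the respective ISNs, since
		 * the SYN and SYN-ACK each consume one sequence number.
		 */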
		newtp->rcv_wup = newtp->copied_seq =
		newtp->rcv_nxt = treq->rcv_isn + 1;
		newtp->segs_in = 1;

		newtp->snd_sml = newtp->snd_una =
		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;

		tcp_prequeue_init(newtp);
		INIT_LIST_HEAD(&newtp->tsq_node);

		tcp_init_wl(newtp, treq->rcv_isn);

		newtp->srtt_us = 0;
		newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
		newtp->rtt_min[0].rtt = ~0U;
		newicsk->icsk_rto = TCP_TIMEOUT_INIT;

		newtp->packets_out = 0;
		newtp->retrans_out = 0;
		newtp->sacked_out = 0;
		newtp->fackets_out = 0;
		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
		tcp_enable_early_retrans(newtp);
		newtp->tlp_high_seq = 0;
		newtp->lsndtime = treq->snt_synack.stamp_jiffies;
		newsk->sk_txhash = treq->txhash;
		newtp->last_oow_ack_time = 0;
		newtp->total_retrans = req->num_retrans;

		/* So many TCP implementations out there (incorrectly) count the
		 * initial SYN frame in their delayed-ACK and congestion control
		 * algorithms that we must have the following bandaid to talk
		 * efficiently to them.  -DaveM
		 */
		newtp->snd_cwnd = TCP_INIT_CWND;
		newtp->snd_cwnd_cnt = 0;

		tcp_init_xmit_timers(newsk);
		__skb_queue_head_init(&newtp->out_of_order_queue);
		newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;

		newtp->rx_opt.saw_tstamp = 0;

		newtp->rx_opt.dsack = 0;
		newtp->rx_opt.num_sacks = 0;

		newtp->urg_data = 0;

		if (sock_flag(newsk, SOCK_KEEPOPEN))
			inet_csk_reset_keepalive_timer(newsk,
						       keepalive_time_when(newtp));

		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
		if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
			if (sysctl_tcp_fack)
				tcp_enable_fack(newtp);
		}
		newtp->window_clamp = req->rsk_window_clamp;
		newtp->rcv_ssthresh = req->rsk_rcv_wnd;
		newtp->rcv_wnd = req->rsk_rcv_wnd;
		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
		if (newtp->rx_opt.wscale_ok) {
			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
			newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
		} else {
			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
			newtp->window_clamp = min(newtp->window_clamp, 65535U);
		}
		newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
				  newtp->rx_opt.snd_wscale);
		newtp->max_window = newtp->snd_wnd;

		if (newtp->rx_opt.tstamp_ok) {
			newtp->rx_opt.ts_recent = req->ts_recent;
			newtp->rx_opt.ts_recent_stamp = get_seconds();
			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			newtp->rx_opt.ts_recent_stamp = 0;
			newtp->tcp_header_len = sizeof(struct tcphdr);
		}
		newtp->tsoffset = 0;
#ifdef CONFIG_TCP_MD5SIG
		newtp->md5sig_info = NULL;	/*XXX*/
		if (newtp->af_specific->md5_lookup(sk, newsk))
			newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
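		/* Seed the delayed-ACK engine's estimate of the peer's segment
		 * size from this segment, if it carries at least a default
		 * MSS worth of payload.
		 */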
		if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
		newtp->rx_opt.mss_clamp = req->mss;
		tcp_ecn_openreq_child(newtp, req);
		newtp->fastopen_rsk = NULL;
		newtp->syn_data_acked = 0;
		newtp->rack.mstamp.v64 = 0;
		newtp->rack.advanced = 0;

		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
	}
	return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);

/*
 * Process an incoming packet for SYN_RECV sockets represented as a
 * request_sock. Normally sk is the listener socket but for TFO it
 * points to the child socket.
 *
 * XXX (TFO) - The current impl contains a special check for ack
 * validation inside tcp_v4_reqsk_send_ack(). Can we do better?
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   bool fastopen)
{
	struct tcp_options_received tmp_opt;
	struct sock *child;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	bool paws_reject = false;
	bool own_req;

	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(struct tcphdr)>>2)) {
		tcp_parse_options(skb, &tmp_opt, 0, NULL);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent = req->ts_recent;
			/* We do not store the true timestamp, but it is not
			 * required; it can be estimated (approximately)
			 * from other data.
			 */
			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	/* Check for pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {
		/*
		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
		 * this case on figure 6 and figure 8, but the formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send an ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 *  describe the SYN-RECV state. All of that description
		 *  is wrong; we cannot trust it and should rely only on
		 *  common sense and implementation experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8, figure 6
		 * of RFC793, fixed by RFC1122.
		 *
		 * Note that even if there is new data in the SYN packet
		 * it will be thrown away too.
		 *
		 * Reset the timer after retransmitting the SYNACK, similar
		 * to the idea of fast retransmit in recovery.
		 */
		if (!tcp_oow_rate_limited(sock_net(sk), skb,
					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
					  &tcp_rsk(req)->last_oow_ack_time) &&
		    !inet_rtx_syn_ack(sk, req)) {
			unsigned long expires = jiffies;

			expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
				       TCP_RTO_MAX);
			if (!fastopen)
				mod_timer_pending(&req->rsk_timer, expires);
			else
				req->rsk_timer.expires = expires;
		}
		return NULL;
	}

	/* Further reproduces section "SEGMENT ARRIVES"
	   for state SYN-RECEIVED of RFC793.
	   It is broken; however, it only fails
	   when SYNs are crossed.

	   You would think that SYN crossing is impossible here, since
	   we should have a SYN_SENT socket (from connect()) on our end,
	   but this is not true if the crossed SYNs were sent to both
	   ends by a malicious third party.  We must defend against this,
	   and to do that we first verify the ACK (as per RFC793, page
	   36) and reset if it is invalid.  Is this a true full defense?
	   To convince ourselves, let us consider a way in which the ACK
	   test can still pass in this 'malicious crossed SYNs' case.
	   A malicious sender sends identical SYNs (and thus identical
	   sequence numbers) to both A and B:

		A: gets SYN, seq=7
		B: gets SYN, seq=7

	   By our good fortune, both A and B select the same initial
	   send sequence number of seven :-)

		A: sends SYN|ACK, seq=7, ack_seq=8
		B: sends SYN|ACK, seq=7, ack_seq=8

	   So now we are A, eating this SYN|ACK; the ACK test passes.  So
	   does the sequence test; the SYN is truncated, and thus we
	   consider it a bare ACK.

	   If icsk->icsk_accept_queue.rskq_defer_accept is set, we silently
	   drop this bare ACK.  Otherwise, we create an established
	   connection.  Both ends (listening sockets) accept the new incoming
	   connection and try to talk to each other. 8-)

	   Note: This case is both harmless and rare.  The probability is
	   about the same as us discovering intelligent life on another
	   planet tomorrow.

	   But generally, we should (the RFC lies!) accept an ACK on the
	   SYNACK both here and in tcp_rcv_state_process().
	   tcp_rcv_state_process() does not, hence we do not either.

	   Note that the case is absolutely generic:
	   we cannot optimize anything here without
	   violating protocol. All the checks must be made
	   before an attempt to create a socket.
	 */

	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
	 *                  and the incoming segment acknowledges something not yet
	 *                  sent (the segment carries an unacceptable ACK) ...
	 *                  a reset is sent."
	 *
	 * Invalid ACK: a reset will be sent by the listening socket.
	 * Note that the ACK validity check for a Fast Open socket is done
	 * elsewhere and is checked directly against the child socket rather
	 * than req because user data may have been sent out.
	 */
	if ((flg & TCP_FLAG_ACK) && !fastopen &&
	    (TCP_SKB_CB(skb)->ack_seq !=
	     tcp_rsk(req)->snt_isn + 1))
		return sk;

	/* Also, it would not be such a bad idea to check rcv_tsecr, which
	 * is essentially an ACK extension; too-early or too-late values
	 * should cause a reset in unsynchronized states.
	 */

	/* RFC793: "first check sequence number". */

	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST))
			req->rsk_ops->send_ack(sk, skb, req);
		if (paws_reject)
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
		req->ts_recent = tmp_opt.rcv_tsval;

	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		   at tcp_rsk(req)->rcv_isn + 1. */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
		goto embryonic_reset;
	}

	/* ACK sequence verified above, just make sure the ACK bit is
	 * set.  If it is not set, silently drop the packet.
	 *
	 * XXX (TFO) - if we ever allow "data after SYN", the
	 * following check needs to be removed.
	 */
	if (!(flg & TCP_FLAG_ACK))
		return NULL;

	/* For Fast Open no more processing is needed (sk is the
	 * child socket).
	 */
	if (fastopen)
		return sk;

	/* While TCP_DEFER_ACCEPT is active, drop the bare ACK. */
	if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
		inet_rsk(req)->acked = 1;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
		return NULL;
	}

	/* OK, the ACK is valid, create the big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
	 * ESTABLISHED STATE. If it gets dropped after the
	 * socket is created, expect trouble.
	 */
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
							 req, &own_req);
	if (!child)
		goto listen_overflow;

	sock_rps_save_rxhash(child, skb);
	tcp_synack_rtt_meas(child, req);
	return inet_csk_complete_hashdance(sk, child, req, own_req);

listen_overflow:
	if (!sysctl_tcp_abort_on_overflow) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

embryonic_reset:
	if (!(flg & TCP_FLAG_RST)) {
		/* Received a bad SYN pkt - for TFO we try not to reset
		 * the local connection unless it's really necessary, to
		 * avoid becoming vulnerable to an outside attack aimed at
		 * resetting legit local connections.
		 */
		req->rsk_ops->send_reset(sk, skb);
	} else if (fastopen) { /* received a valid RST pkt */
		reqsk_fastopen_remove(sk, req, true);
		tcp_reset(sk);
	}
	if (!fastopen) {
		inet_csk_reqsk_queue_drop(sk, req);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_check_req);

/*
 * Queue the segment on the new socket if the new socket is active,
 * otherwise we just short-circuit this and continue with
 * the new socket.
 *
 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
 * when entering. But other states are possible due to a race condition
 * where after __inet_lookup_established() fails but before the listener
 * lock is obtained, other packets cause the same connection to
 * be created.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	int ret = 0;
	int state = child->sk_state;

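	/* Account for every segment GRO may have aggregated into this skb. */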
	tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
	if (!sock_owned_by_user(child)) {
		ret = tcp_rcv_state_process(child, skb);
		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->sk_state != state)
			parent->sk_data_ready(parent);
	} else {
		/* Alas, it is possible again, because we do the lookup
		 * in the main socket hash table and the lock on the
		 * listening socket does not protect us any more.
		 */
		__sk_add_backlog(child, skb);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return ret;
}
EXPORT_SYMBOL(tcp_child_process);
