/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

int sysctl_tcp_syncookies __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_syncookies);

int sysctl_tcp_abort_on_overflow __read_mostly;

struct inet_timewait_death_row tcp_death_row = {
	.sysctl_max_tw_buckets = NR_FILE * 2,
	.hashinfo	= &tcp_hashinfo,
};
EXPORT_SYMBOL_GPL(tcp_death_row);

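/* Return true if the segment [seq, end_seq) overlaps the receive window
 * [s_win, e_win), if it starts exactly at the left window edge, or if it is
 * a zero-length segment sitting exactly at the right window edge.
 */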
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	if (seq == s_win)
		return true;
	if (after(end_seq, s_win) && before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

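/* Decide whether a TIME-WAIT socket may answer an out-of-window segment with
 * an ACK, or whether that ACK must be suppressed by the out-of-window rate
 * limiter.  On the rate-limited path the tw reference is dropped here.
 */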
static enum tcp_tw_status
tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
				  const struct sk_buff *skb, int mib_idx)
{
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

	if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
				  &tcptw->tw_last_oow_ack_time)) {
		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by caller.
		 */
		return TCP_TW_ACK;
	}

	/* We are rate-limiting, so just release the tw sock and drop skb. */
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}

/*
 * * The main purpose of TIME-WAIT state is to close the connection
 *   gracefully when one of the ends sits in LAST-ACK or CLOSING,
 *   retransmitting its FIN (and, probably, a tail of data) while one or
 *   more of our ACKs are lost.
 * * What is the TIME-WAIT timeout? It is associated with the maximal
 *   packet lifetime in the internet, which leads to the wrong conclusion
 *   that it is set to catch "old duplicate segments" wandering out of
 *   their path.  That is not quite correct.  The timeout is calculated so
 *   that it exceeds the maximal retransmission timeout by enough to allow
 *   for the loss of one (or more) segments sent by the peer and of our
 *   ACKs.  This time may be calculated from the RTO.
 * * When a TIME-WAIT socket receives an RST, it means that the other end
 *   finally closed and we are allowed to kill the TIME-WAIT too.
 * * The second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT with
 *   this semantics, we MUST NOT kill the TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
 * When you compare it to the RFCs, please read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
 * is _not_ stateless.  Strictly speaking we would have to spinlock it,
 * which I do not want to do.  The probability of misbehaviour is
 * ridiculously low and, it seems, we could use some mb() tricks
 * to avoid misreading sequence numbers, states etc.  --ANK
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results.
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
			   const struct tcphdr *th)
{
	struct tcp_options_received tmp_opt;
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
	bool paws_reject = false;

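	/* Parse TCP options only if the header can actually carry them and we
	 * have a stored timestamp to compare against; the echoed timestamp
	 * (adjusted by tw_ts_offset) then feeds the PAWS rejection check.
	 */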
	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
		tcp_parse_options(skb, &tmp_opt, 0, NULL);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.rcv_tsecr	-= tcptw->tw_ts_offset;
			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	if (tw->tw_substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Out of window, send ACK */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tcptw->tw_rcv_nxt,
				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
			return tcp_timewait_check_oow_rate_limit(
				tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);

		if (th->rst)
			goto kill;

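		/* A SYN at or above rcv_nxt while we linger in FIN-WAIT-2 is a
		 * fresh connection attempt on this four-tuple: kill the old
		 * state and let the caller answer with a reset.
		 */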
		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
			goto kill_with_rst;

		/* Dup ACK? */
		if (!th->ack ||
		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			inet_twsk_put(tw);
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrive after half-duplex close,
		 * reset.
		 */
		if (!th->fin ||
		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
			inet_twsk_deschedule(tw);
			inet_twsk_put(tw);
			return TCP_TW_RST;
		}

		/* FIN arrived, enter true time-wait state. */
		tw->tw_substate	  = TCP_TIME_WAIT;
		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent_stamp = get_seconds();
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
		}

		if (tcp_death_row.sysctl_tw_recycle &&
		    tcptw->tw_ts_recent_stamp &&
		    tcp_tw_remember_stamp(tw))
			inet_twsk_reschedule(tw, tw->tw_timeout);
		else
			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 *	Now real TIME-WAIT state.
	 *
	 *	RFC 1122:
	 *	"When a connection is [...] in TIME-WAIT state [...]
	 *	[a TCP] MAY accept a new SYN from the remote TCP to
	 *	reopen the connection directly, if it:
	 *
	 *	(1)  assigns its initial sequence number for the new
	 *	connection to be larger than the largest sequence
	 *	number it used on the previous connection incarnation,
	 *	and
	 *
	 *	(2)  returns to TIME-WAIT state if the SYN turns out
	 *	to be an old duplicate".
	 */

	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* An in-window segment here can only be a reset or a bare ACK. */

		if (th->rst) {
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 */
			if (sysctl_tcp_rfc1337 == 0) {
kill:
				inet_twsk_deschedule(tw);
				inet_twsk_put(tw);
				return TCP_TW_SUCCESS;
			}
		}
		inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);

		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
			tcptw->tw_ts_recent_stamp = get_seconds();
		}

		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment.

	   All such segments are ACKed immediately.

	   The only exception is a new SYN. We accept it, if it is
	   not an old duplicate and we are not in danger of being killed
	   by delayed old duplicates. The RFC check - that it carries a
	   newer sequence number - works at rates below ~40 Mbit/sec.
	   However, if PAWS works, it is reliable, and moreover we may
	   relax the silly sequence-space cutoff.

	   RED-PEN: we violate the main RFC requirement: if this SYN turns
	   out to be an old duplicate (i.e. we receive an RST in reply to
	   our SYN-ACK), we must return the socket to time-wait state.
	   That is not good, but not fatal yet.
	 */

	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
	     (tmp_opt.saw_tstamp &&
	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
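		/* Accept the new SYN: derive an initial sequence number above
		 * anything the old incarnation could have used.  Zero is
		 * skipped because a zero tcp_tw_isn means "no TIME-WAIT ISN".
		 */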
		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
		if (isn == 0)
			isn++;
		TCP_SKB_CB(skb)->tcp_tw_isn = isn;
		return TCP_TW_SYN;
	}

	if (paws_reject)
		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

	if (!th->rst) {
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is an ACKless SYN, it may be either an old duplicate
		 * or a new good SYN with a random sequence number < rcv_nxt.
		 * Do not reschedule in the latter case.
		 */
		if (paws_reject || th->ack)
			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);

		return tcp_timewait_check_oow_rate_limit(
			tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
	}
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}
EXPORT_SYMBOL(tcp_timewait_state_process);

/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_sock *tp = tcp_sk(sk);
	struct inet_timewait_sock *tw;
	bool recycle_ok = false;

	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
		recycle_ok = tcp_remember_stamp(sk);

	tw = inet_twsk_alloc(sk, &tcp_death_row, state);

	if (tw) {
		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
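		/* 3.5 * RTO: the floor imposed on the TIME-WAIT/FIN-WAIT2
		 * timer below, and the whole timeout when recycling is on.
		 */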
		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
		struct inet_sock *inet = inet_sk(sk);

		tw->tw_transparent	= inet->transparent;
		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
		tcptw->tw_snd_nxt	= tp->snd_nxt;
		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
		tcptw->tw_ts_offset	= tp->tsoffset;
		tcptw->tw_last_oow_ack_time = 0;

#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == PF_INET6) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			tw->tw_v6_daddr = sk->sk_v6_daddr;
			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
			tw->tw_tclass = np->tclass;
			tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
			tw->tw_ipv6only = sk->sk_ipv6only;
		}
#endif

#ifdef CONFIG_TCP_MD5SIG
		/*
		 * The timewait bucket does not have the key DB from the
		 * sock structure. We just make a quick copy of the
		 * md5 key being used (if indeed we are using one)
		 * so the timewait ack generating code has the key.
		 */
		do {
			struct tcp_md5sig_key *key;
			tcptw->tw_md5_key = NULL;
			key = tp->af_specific->md5_lookup(sk, sk);
			if (key) {
				tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
				if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
					BUG();
			}
		} while (0);
#endif

		/* Get the TIME_WAIT timeout firing. */
		if (timeo < rto)
			timeo = rto;

		if (recycle_ok) {
			tw->tw_timeout = rto;
		} else {
			tw->tw_timeout = TCP_TIMEWAIT_LEN;
			if (state == TCP_TIME_WAIT)
				timeo = TCP_TIMEWAIT_LEN;
		}

		inet_twsk_schedule(tw, timeo);
		/* Linkage updates. */
		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
		inet_twsk_put(tw);
	} else {
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up.  We've got bigger problems than
		 * non-graceful socket closings.
		 */
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
	}

	tcp_update_metrics(sk);
	tcp_done(sk);
}

void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_timewait_sock *twsk = tcp_twsk(sk);

	if (twsk->tw_md5_key)
		kfree_rcu(twsk->tw_md5_key, rcu);
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);

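/* Initialise the receive-window state of a request sock: clamp the window
 * according to route metrics and socket buffer limits, then pick the initial
 * window and receive window scale to advertise in the SYN-ACK.
 */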
void tcp_openreq_init_rwin(struct request_sock *req,
			   struct sock *sk, struct dst_entry *dst)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_sock *tp = tcp_sk(sk);
	__u8 rcv_wscale;
	int mss = dst_metric_advmss(dst);

	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
		mss = tp->rx_opt.user_mss;

	/* Set this up on the first call only */
	req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);

	/* limit the window selection if the user enforces a smaller rx buffer */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
	    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
		req->window_clamp = tcp_full_space(sk);

	/* Use tcp_full_space() because this is guaranteed to be the first packet */
	tcp_select_initial_window(tcp_full_space(sk),
		mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
		&req->rcv_wnd,
		&req->window_clamp,
		ireq->wscale_ok,
		&rcv_wscale,
		dst_metric(dst, RTAX_INITRWND));
	ireq->rcv_wscale = rcv_wscale;
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);

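/* Carry the ECN capability negotiated during the handshake over to the
 * child socket.
 */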
static void tcp_ecn_openreq_child(struct tcp_sock *tp,
				  const struct request_sock *req)
{
	tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}

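/* Choose the congestion control module for a passively opened socket: prefer
 * the algorithm keyed by the route's RTAX_CC_ALGO metric; otherwise keep the
 * listener's explicitly configured choice or fall back to the system default.
 */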
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
	bool ca_got_dst = false;

	if (ca_key != TCP_CA_UNSPEC) {
		const struct tcp_congestion_ops *ca;

		rcu_read_lock();
		ca = tcp_ca_find_key(ca_key);
		if (likely(ca && try_module_get(ca->owner))) {
			icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
			icsk->icsk_ca_ops = ca;
			ca_got_dst = true;
		}
		rcu_read_unlock();
	}

	/* If no valid choice made yet, assign current system default ca. */
	if (!ca_got_dst &&
	    (!icsk->icsk_ca_setsockopt ||
	     !try_module_get(icsk->icsk_ca_ops->owner)))
		tcp_assign_congestion_control(sk);

	tcp_set_ca_state(sk, TCP_CA_Open);
}
EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could save a lot of memory writes here: the tp of the
 * listening socket already contains all the necessary default parameters.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);

	if (newsk) {
		const struct inet_request_sock *ireq = inet_rsk(req);
		struct tcp_request_sock *treq = tcp_rsk(req);
		struct inet_connection_sock *newicsk = inet_csk(newsk);
		struct tcp_sock *newtp = tcp_sk(newsk);

		/* Now setup tcp_sock */
		newtp->pred_flags = 0;

		newtp->rcv_wup = newtp->copied_seq =
		newtp->rcv_nxt = treq->rcv_isn + 1;

		newtp->snd_sml = newtp->snd_una =
		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;

		tcp_prequeue_init(newtp);
		INIT_LIST_HEAD(&newtp->tsq_node);

		tcp_init_wl(newtp, treq->rcv_isn);

		newtp->srtt_us = 0;
		newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
		newicsk->icsk_rto = TCP_TIMEOUT_INIT;

		newtp->packets_out = 0;
		newtp->retrans_out = 0;
		newtp->sacked_out = 0;
		newtp->fackets_out = 0;
		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
		tcp_enable_early_retrans(newtp);
		newtp->tlp_high_seq = 0;
		newtp->lsndtime = treq->snt_synack;
		newtp->last_oow_ack_time = 0;
		newtp->total_retrans = req->num_retrans;

		/* So many TCP implementations out there (incorrectly) count the
		 * initial SYN frame in their delayed-ACK and congestion control
		 * algorithms that we must have the following bandaid to talk
		 * efficiently to them.  -DaveM
		 */
		newtp->snd_cwnd = TCP_INIT_CWND;
		newtp->snd_cwnd_cnt = 0;

		tcp_init_xmit_timers(newsk);
		__skb_queue_head_init(&newtp->out_of_order_queue);
		newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;

		newtp->rx_opt.saw_tstamp = 0;

		newtp->rx_opt.dsack = 0;
		newtp->rx_opt.num_sacks = 0;

		newtp->urg_data = 0;

		if (sock_flag(newsk, SOCK_KEEPOPEN))
			inet_csk_reset_keepalive_timer(newsk,
						       keepalive_time_when(newtp));

		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
		if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
			if (sysctl_tcp_fack)
				tcp_enable_fack(newtp);
		}
		newtp->window_clamp = req->window_clamp;
		newtp->rcv_ssthresh = req->rcv_wnd;
		newtp->rcv_wnd = req->rcv_wnd;
		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
		if (newtp->rx_opt.wscale_ok) {
			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
			newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
		} else {
			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
			newtp->window_clamp = min(newtp->window_clamp, 65535U);
		}
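		/* Take the send window advertised in the incoming segment,
		 * applying the scale factor the peer announced (zero if window
		 * scaling was not negotiated).
		 */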
		newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
				  newtp->rx_opt.snd_wscale);
		newtp->max_window = newtp->snd_wnd;

		if (newtp->rx_opt.tstamp_ok) {
			newtp->rx_opt.ts_recent = req->ts_recent;
			newtp->rx_opt.ts_recent_stamp = get_seconds();
			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			newtp->rx_opt.ts_recent_stamp = 0;
			newtp->tcp_header_len = sizeof(struct tcphdr);
		}
		newtp->tsoffset = 0;
#ifdef CONFIG_TCP_MD5SIG
		newtp->md5sig_info = NULL;	/*XXX*/
		if (newtp->af_specific->md5_lookup(sk, newsk))
			newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
		if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
		newtp->rx_opt.mss_clamp = req->mss;
		tcp_ecn_openreq_child(newtp, req);
		newtp->fastopen_rsk = NULL;
		newtp->syn_data_acked = 0;

		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
	}
	return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);

/*
 * Process an incoming packet for SYN_RECV sockets represented as a
 * request_sock. Normally sk is the listener socket but for TFO it
 * points to the child socket.
 *
 * XXX (TFO) - The current impl contains a special check for ACK
 * validation inside tcp_v4_reqsk_send_ack(). Can we do better?
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   bool fastopen)
{
	struct tcp_options_received tmp_opt;
	struct sock *child;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	bool paws_reject = false;

	BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));

	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(struct tcphdr)>>2)) {
		tcp_parse_options(skb, &tmp_opt, 0, NULL);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent = req->ts_recent;
			/* We do not store the true timestamp, but it is not
			 * required: it can be estimated (approximately)
			 * from other data.
			 */
			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	/* Check for pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {
		/*
		 * RFC793 draws this case (incorrectly! it was fixed in
		 * RFC1122) in figure 6 and figure 8, but the formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send an ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 *  describe SYN-RECV state. All of its description
		 *  is wrong; we cannot trust it and should rely only
		 *  on common sense and implementation experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8 and figure 6
		 * of RFC793, fixed by RFC1122.
		 *
		 * Note that even if there is new data in the SYN packet
		 * it will be thrown away too.
		 *
		 * Reset the timer after retransmitting the SYNACK, similar
		 * to the idea of fast retransmit in recovery.
		 */
		if (!tcp_oow_rate_limited(sock_net(sk), skb,
					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
					  &tcp_rsk(req)->last_oow_ack_time) &&

		    !inet_rtx_syn_ack(sk, req)) {
			unsigned long expires = jiffies;

			expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
				       TCP_RTO_MAX);
			if (!fastopen)
				mod_timer_pending(&req->rsk_timer, expires);
			else
				req->rsk_timer.expires = expires;
		}
		return NULL;
	}

	/* Further reproduces section "SEGMENT ARRIVES"
	   for state SYN-RECEIVED of RFC793.
	   It is broken, however: it fails only when SYNs are crossed.

	   You would think that SYN crossing is impossible here, since
	   we should have a SYN_SENT socket (from connect()) on our end,
	   but this is not true if the crossed SYNs were sent to both
	   ends by a malicious third party.  We must defend against this,
	   and to do that we first verify the ACK (as per RFC793, page
	   36) and reset if it is invalid.  Is this a true full defense?
	   To convince ourselves, let us consider a way in which the ACK
	   test can still pass in this 'malicious crossed SYNs' case.
	   Malicious sender sends identical SYNs (and thus identical sequence
	   numbers) to both A and B:

		A: gets SYN, seq=7
		B: gets SYN, seq=7

	   By our good fortune, both A and B select the same initial
	   send sequence number of seven :-)

		A: sends SYN|ACK, seq=7, ack_seq=8
		B: sends SYN|ACK, seq=7, ack_seq=8

	   So we are now A eating this SYN|ACK, ACK test passes.  So
	   does the sequence test, SYN is truncated, and thus we consider
	   it a bare ACK.

	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
	   bare ACK.  Otherwise, we create an established connection.  Both
	   ends (listening sockets) accept the new incoming connection and try
	   to talk to each other. 8-)

	   Note: This case is both harmless and rare.  The probability is about
	   the same as us discovering intelligent life on another planet
	   tomorrow.

	   But generally, we should (the RFC lies!) accept an ACK
	   from a SYNACK both here and in tcp_rcv_state_process().
	   tcp_rcv_state_process() does not, hence, we do not too.

	   Note that the case is absolutely generic:
	   we cannot optimize anything here without
	   violating protocol. All the checks must be made
	   before an attempt to create a socket.
	 */

	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
	 *                  and the incoming segment acknowledges something not yet
	 *                  sent (the segment carries an unacceptable ACK) ...
	 *                  a reset is sent."
	 *
	 * Invalid ACK: reset will be sent by listening socket.
	 * Note that the ACK validity check for a Fast Open socket is done
	 * elsewhere and is checked directly against the child socket rather
	 * than req because user data may have been sent out.
	 */
	if ((flg & TCP_FLAG_ACK) && !fastopen &&
	    (TCP_SKB_CB(skb)->ack_seq !=
	     tcp_rsk(req)->snt_isn + 1))
		return sk;

	/* Also, it would not be a bad idea to check rcv_tsecr, which
	 * is essentially an ACK extension; too early or too late values
	 * should cause a reset in unsynchronized states.
	 */

	/* RFC793: "first check sequence number". */

	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST))
			req->rsk_ops->send_ack(sk, skb, req);
		if (paws_reject)
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
		req->ts_recent = tmp_opt.rcv_tsval;

	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		   at tcp_rsk(req)->rcv_isn + 1. */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
		goto embryonic_reset;
	}

	/* ACK sequence verified above, just make sure ACK is
	 * set.  If ACK not set, just silently drop the packet.
	 *
	 * XXX (TFO) - if we ever allow "data after SYN", the
	 * following check needs to be removed.
	 */
	if (!(flg & TCP_FLAG_ACK))
		return NULL;

	/* For Fast Open no more processing is needed (sk is the
	 * child socket).
	 */
	if (fastopen)
		return sk;

	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
	if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
		inet_rsk(req)->acked = 1;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
		return NULL;
	}

	/* OK, ACK is valid, create big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
	 * ESTABLISHED STATE. If it will be dropped after
	 * socket is created, wait for troubles.
	 */
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (!child)
		goto listen_overflow;

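	/* Move the request off the listener's SYN queue and attach the newly
	 * created child to its accept queue.
	 */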
	inet_csk_reqsk_queue_drop(sk, req);
	inet_csk_reqsk_queue_add(sk, req, child);
	/* Warning: caller must not call reqsk_put(req);
	 * child stole last reference on it.
	 */
	return child;

listen_overflow:
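	/* The accept queue is full.  Unless tcp_abort_on_overflow asks for a
	 * hard reset, remember that the final ACK arrived and silently drop it.
	 */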
	if (!sysctl_tcp_abort_on_overflow) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

embryonic_reset:
	if (!(flg & TCP_FLAG_RST)) {
		/* Received a bad SYN pkt - for TFO we try not to reset
		 * the local connection unless it's really necessary, to
		 * avoid becoming vulnerable to outside attacks aiming at
		 * resetting legit local connections.
		 */
		req->rsk_ops->send_reset(sk, skb);
	} else if (fastopen) { /* received a valid RST pkt */
		reqsk_fastopen_remove(sk, req, true);
		tcp_reset(sk);
	}
	if (!fastopen) {
		inet_csk_reqsk_queue_drop(sk, req);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_check_req);

/*
 * Queue the segment on the new socket if the new socket is active,
 * otherwise we just short-circuit this and continue with
 * the new socket.
 *
 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
 * when entering. But other states are possible due to a race condition
 * where after __inet_lookup_established() fails but before the listener
 * lock is obtained, other packets cause the same connection to
 * be created.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	int ret = 0;
	int state = child->sk_state;

	if (!sock_owned_by_user(child)) {
		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
					    skb->len);
		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->sk_state != state)
			parent->sk_data_ready(parent);
	} else {
		/* Alas, it is possible again, because we do lookup
		 * in main socket hash table and lock on listening
		 * socket does not protect us more.
		 */
		__sk_add_backlog(child, skb);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return ret;
}
EXPORT_SYMBOL(tcp_child_process);
