| 1 |
/*- |
| 2 |
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 |
| 3 |
* The Regents of the University of California. All rights reserved. |
| 4 |
* Copyright (c) 2007-2008,2010 |
| 5 |
* Swinburne University of Technology, Melbourne, Australia. |
| 6 |
* Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> |
| 7 |
* Copyright (c) 2010 The FreeBSD Foundation |
| 8 |
* Copyright (c) 2010-2011 Juniper Networks, Inc. |
| 9 |
* All rights reserved. |
| 10 |
* |
| 11 |
* Portions of this software were developed at the Centre for Advanced Internet |
| 12 |
* Architectures, Swinburne University of Technology, by Lawrence Stewart, |
| 13 |
* James Healy and David Hayes, made possible in part by a grant from the Cisco |
| 14 |
* University Research Program Fund at Community Foundation Silicon Valley. |
| 15 |
* |
| 16 |
* Portions of this software were developed at the Centre for Advanced |
| 17 |
* Internet Architectures, Swinburne University of Technology, Melbourne, |
| 18 |
* Australia by David Hayes under sponsorship from the FreeBSD Foundation. |
| 19 |
* |
| 20 |
* Portions of this software were developed by Robert N. M. Watson under |
| 21 |
* contract to Juniper Networks, Inc. |
| 22 |
* |
| 23 |
* Redistribution and use in source and binary forms, with or without |
| 24 |
* modification, are permitted provided that the following conditions |
| 25 |
* are met: |
| 26 |
* 1. Redistributions of source code must retain the above copyright |
| 27 |
* notice, this list of conditions and the following disclaimer. |
| 28 |
* 2. Redistributions in binary form must reproduce the above copyright |
| 29 |
* notice, this list of conditions and the following disclaimer in the |
| 30 |
* documentation and/or other materials provided with the distribution. |
| 31 |
* 4. Neither the name of the University nor the names of its contributors |
| 32 |
* may be used to endorse or promote products derived from this software |
| 33 |
* without specific prior written permission. |
| 34 |
* |
| 35 |
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 36 |
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 37 |
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 38 |
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 39 |
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 40 |
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 41 |
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 42 |
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 43 |
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 44 |
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 45 |
* SUCH DAMAGE. |
| 46 |
* |
| 47 |
* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 |
| 48 |
*/ |
| 49 |
|
| 50 |
#include <sys/cdefs.h> |
| 51 |
__FBSDID("$FreeBSD$"); |
| 52 |
|
| 53 |
#include "opt_ipfw.h" /* for ipfw_fwd */ |
| 54 |
#include "opt_inet.h" |
| 55 |
#include "opt_inet6.h" |
| 56 |
#include "opt_ipsec.h" |
| 57 |
#include "opt_tcpdebug.h" |
| 58 |
|
| 59 |
#include <sys/param.h> |
| 60 |
#include <sys/kernel.h> |
| 61 |
#include <sys/hhook.h> |
| 62 |
#include <sys/malloc.h> |
| 63 |
#include <sys/mbuf.h> |
| 64 |
#include <sys/proc.h> /* for proc0 declaration */ |
| 65 |
#include <sys/protosw.h> |
| 66 |
#include <sys/signalvar.h> |
| 67 |
#include <sys/socket.h> |
| 68 |
#include <sys/socketvar.h> |
| 69 |
#include <sys/sysctl.h> |
| 70 |
#include <sys/syslog.h> |
| 71 |
#include <sys/systm.h> |
| 72 |
|
| 73 |
#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ |
| 74 |
|
| 75 |
#include <vm/uma.h> |
| 76 |
|
| 77 |
#include <net/if.h> |
| 78 |
#include <net/route.h> |
| 79 |
#include <net/vnet.h> |
| 80 |
|
| 81 |
#define TCPSTATES /* for logging */ |
| 82 |
|
| 83 |
#include <netinet/cc.h> |
| 84 |
#include <netinet/in.h> |
| 85 |
#include <netinet/in_pcb.h> |
| 86 |
#include <netinet/in_systm.h> |
| 87 |
#include <netinet/in_var.h> |
| 88 |
#include <netinet/ip.h> |
| 89 |
#include <netinet/ip_icmp.h> /* required for icmp_var.h */ |
| 90 |
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ |
| 91 |
#include <netinet/ip_var.h> |
| 92 |
#include <netinet/ip_options.h> |
| 93 |
#include <netinet/ip6.h> |
| 94 |
#include <netinet/icmp6.h> |
| 95 |
#include <netinet6/in6_pcb.h> |
| 96 |
#include <netinet6/ip6_var.h> |
| 97 |
#include <netinet6/nd6.h> |
| 98 |
#include <netinet/tcp_fsm.h> |
| 99 |
#include <netinet/tcp_seq.h> |
| 100 |
#include <netinet/tcp_timer.h> |
| 101 |
#include <netinet/tcp_var.h> |
| 102 |
#include <netinet6/tcp6_var.h> |
| 103 |
#include <netinet/tcpip.h> |
| 104 |
#include <netinet/tcp_syncache.h> |
| 105 |
#ifdef TCPDEBUG |
| 106 |
#include <netinet/tcp_debug.h> |
| 107 |
#endif /* TCPDEBUG */ |
| 108 |
#ifdef TCP_OFFLOAD |
| 109 |
#include <netinet/tcp_offload.h> |
| 110 |
#endif |
| 111 |
|
| 112 |
#ifdef IPSEC |
| 113 |
#include <netipsec/ipsec.h> |
| 114 |
#include <netipsec/ipsec6.h> |
| 115 |
#endif /*IPSEC*/ |
| 116 |
|
| 117 |
#include <machine/in_cksum.h> |
| 118 |
|
| 119 |
#include <security/mac/mac_framework.h> |
| 120 |
|
| 121 |
const int tcprexmtthresh = 3; |
| 122 |
|
| 123 |
VNET_DEFINE(struct tcpstat, tcpstat); |
| 124 |
SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, |
| 125 |
&VNET_NAME(tcpstat), tcpstat, |
| 126 |
"TCP statistics (struct tcpstat, netinet/tcp_var.h)"); |
| 127 |
|
| 128 |
int tcp_log_in_vain = 0; |
| 129 |
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, |
| 130 |
&tcp_log_in_vain, 0, |
| 131 |
"Log all incoming TCP segments to closed ports"); |
| 132 |
|
| 133 |
VNET_DEFINE(int, blackhole) = 0; |
| 134 |
#define V_blackhole VNET(blackhole) |
| 135 |
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, |
| 136 |
&VNET_NAME(blackhole), 0, |
| 137 |
"Do not send RST on segments to closed ports"); |
| 138 |
|
| 139 |
VNET_DEFINE(int, tcp_delack_enabled) = 1; |
| 140 |
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, |
| 141 |
&VNET_NAME(tcp_delack_enabled), 0, |
| 142 |
"Delay ACK to try and piggyback it onto a data packet"); |
| 143 |
|
| 144 |
VNET_DEFINE(int, drop_synfin) = 0; |
| 145 |
#define V_drop_synfin VNET(drop_synfin) |
| 146 |
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, |
| 147 |
&VNET_NAME(drop_synfin), 0, |
| 148 |
"Drop TCP packets with SYN+FIN set"); |
| 149 |
|
| 150 |
VNET_DEFINE(int, tcp_do_rfc3042) = 1; |
| 151 |
#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) |
| 152 |
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, |
| 153 |
&VNET_NAME(tcp_do_rfc3042), 0, |
| 154 |
"Enable RFC 3042 (Limited Transmit)"); |
| 155 |
|
| 156 |
VNET_DEFINE(int, tcp_do_rfc3390) = 1; |
| 157 |
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, |
| 158 |
&VNET_NAME(tcp_do_rfc3390), 0, |
| 159 |
"Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); |
| 160 |
|
| 161 |
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, experimental, CTLFLAG_RW, 0, |
| 162 |
"Experimental TCP extensions"); |
| 163 |
|
| 164 |
VNET_DEFINE(int, tcp_do_initcwnd10) = 1; |
| 165 |
SYSCTL_VNET_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_RW, |
| 166 |
&VNET_NAME(tcp_do_initcwnd10), 0, |
| 167 |
"Enable RFC 6928 (Increasing initial CWND to 10)"); |
| 168 |
|
| 169 |
VNET_DEFINE(int, tcp_do_rfc3465) = 1; |
| 170 |
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, |
| 171 |
&VNET_NAME(tcp_do_rfc3465), 0, |
| 172 |
"Enable RFC 3465 (Appropriate Byte Counting)"); |
| 173 |
|
| 174 |
VNET_DEFINE(int, tcp_abc_l_var) = 2; |
| 175 |
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW, |
| 176 |
&VNET_NAME(tcp_abc_l_var), 2, |
| 177 |
"Cap the max cwnd increment during slow-start to this number of segments"); |
| 178 |
|
| 179 |
static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); |
| 180 |
|
| 181 |
VNET_DEFINE(int, tcp_do_ecn) = 0; |
| 182 |
SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, |
| 183 |
&VNET_NAME(tcp_do_ecn), 0, |
| 184 |
"TCP ECN support"); |
| 185 |
|
| 186 |
VNET_DEFINE(int, tcp_ecn_maxretries) = 1; |
| 187 |
SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW, |
| 188 |
&VNET_NAME(tcp_ecn_maxretries), 0, |
| 189 |
"Max retries before giving up on ECN"); |
| 190 |
|
| 191 |
VNET_DEFINE(int, tcp_insecure_rst) = 0; |
| 192 |
#define V_tcp_insecure_rst VNET(tcp_insecure_rst) |
| 193 |
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, |
| 194 |
&VNET_NAME(tcp_insecure_rst), 0, |
| 195 |
"Follow the old (insecure) criteria for accepting RST packets"); |
| 196 |
|
| 197 |
VNET_DEFINE(int, tcp_recvspace) = 1024*64; |
| 198 |
#define V_tcp_recvspace VNET(tcp_recvspace) |
| 199 |
SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, |
| 200 |
&VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); |
| 201 |
|
| 202 |
VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; |
| 203 |
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) |
| 204 |
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, |
| 205 |
&VNET_NAME(tcp_do_autorcvbuf), 0, |
| 206 |
"Enable automatic receive buffer sizing"); |
| 207 |
|
| 208 |
VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024; |
| 209 |
#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) |
| 210 |
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, |
| 211 |
&VNET_NAME(tcp_autorcvbuf_inc), 0, |
| 212 |
"Incrementor step size of automatic receive buffer"); |
| 213 |
|
| 214 |
VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; |
| 215 |
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) |
| 216 |
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, |
| 217 |
&VNET_NAME(tcp_autorcvbuf_max), 0, |
| 218 |
"Max size of automatic receive buffer"); |
| 219 |
|
| 220 |
VNET_DEFINE(struct inpcbhead, tcb); |
| 221 |
#define tcb6 tcb /* for KAME src sync over BSD*'s */ |
| 222 |
VNET_DEFINE(struct inpcbinfo, tcbinfo); |
| 223 |
|
| 224 |
static void tcp_dooptions(struct tcpopt *, u_char *, int, int); |
| 225 |
static void tcp_do_segment(struct mbuf *, struct tcphdr *, |
| 226 |
struct socket *, struct tcpcb *, int, int, uint8_t, |
| 227 |
int); |
| 228 |
static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, |
| 229 |
struct tcpcb *, int, int); |
| 230 |
static void tcp_pulloutofband(struct socket *, |
| 231 |
struct tcphdr *, struct mbuf *, int); |
| 232 |
static void tcp_xmit_timer(struct tcpcb *, int); |
| 233 |
static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); |
| 234 |
static void inline tcp_fields_to_host(struct tcphdr *); |
| 235 |
#ifdef TCP_SIGNATURE |
| 236 |
static void inline tcp_fields_to_net(struct tcphdr *); |
| 237 |
static int inline tcp_signature_verify_input(struct mbuf *, int, int, |
| 238 |
int, struct tcpopt *, struct tcphdr *, u_int); |
| 239 |
#endif |
| 240 |
static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, |
| 241 |
uint16_t type); |
| 242 |
static void inline cc_conn_init(struct tcpcb *tp); |
| 243 |
static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); |
| 244 |
static void inline hhook_run_tcp_est_in(struct tcpcb *tp, |
| 245 |
struct tcphdr *th, struct tcpopt *to); |
| 246 |
|
| 247 |
/* |
| 248 |
* Kernel module interface for updating tcpstat. The argument is an index |
| 249 |
* into tcpstat treated as an array of u_long. While this encodes the |
| 250 |
* general layout of tcpstat into the caller, it doesn't encode its location, |
| 251 |
* so that future changes to add, for example, per-CPU stats support won't |
| 252 |
* cause binary compatibility problems for kernel modules. |
| 253 |
*/ |
| 254 |
void |
| 255 |
kmod_tcpstat_inc(int statnum) |
| 256 |
{ |
| 257 |
|
| 258 |
(*((u_long *)&V_tcpstat + statnum))++; |
| 259 |
} |
| 260 |
|
| 261 |
/* |
| 262 |
* Wrapper for the TCP established input helper hook. |
| 263 |
*/ |
| 264 |
static void inline |
| 265 |
hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) |
| 266 |
{ |
| 267 |
struct tcp_hhook_data hhook_data; |
| 268 |
|
| 269 |
if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) { |
| 270 |
hhook_data.tp = tp; |
| 271 |
hhook_data.th = th; |
| 272 |
hhook_data.to = to; |
| 273 |
|
| 274 |
hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, |
| 275 |
tp->osd); |
| 276 |
} |
| 277 |
} |
| 278 |
|
| 279 |
/* |
| 280 |
* CC wrapper hook functions |
| 281 |
*/ |
| 282 |
static void inline |
| 283 |
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) |
| 284 |
{ |
| 285 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 286 |
|
| 287 |
tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); |
| 288 |
if (tp->snd_cwnd <= tp->snd_wnd) |
| 289 |
tp->ccv->flags |= CCF_CWND_LIMITED; |
| 290 |
else |
| 291 |
tp->ccv->flags &= ~CCF_CWND_LIMITED; |
| 292 |
|
| 293 |
if (type == CC_ACK) { |
| 294 |
if (tp->snd_cwnd > tp->snd_ssthresh) { |
| 295 |
tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, |
| 296 |
V_tcp_abc_l_var * tp->t_maxseg); |
| 297 |
if (tp->t_bytes_acked >= tp->snd_cwnd) { |
| 298 |
tp->t_bytes_acked -= tp->snd_cwnd; |
| 299 |
tp->ccv->flags |= CCF_ABC_SENTAWND; |
| 300 |
} |
| 301 |
} else { |
| 302 |
tp->ccv->flags &= ~CCF_ABC_SENTAWND; |
| 303 |
tp->t_bytes_acked = 0; |
| 304 |
} |
| 305 |
} |
| 306 |
|
| 307 |
if (CC_ALGO(tp)->ack_received != NULL) { |
| 308 |
/* XXXLAS: Find a way to live without this */ |
| 309 |
tp->ccv->curack = th->th_ack; |
| 310 |
CC_ALGO(tp)->ack_received(tp->ccv, type); |
| 311 |
} |
| 312 |
} |
| 313 |
|
| 314 |
static void inline |
| 315 |
cc_conn_init(struct tcpcb *tp) |
| 316 |
{ |
| 317 |
struct hc_metrics_lite metrics; |
| 318 |
struct inpcb *inp = tp->t_inpcb; |
| 319 |
int rtt; |
| 320 |
|
| 321 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 322 |
|
| 323 |
tcp_hc_get(&inp->inp_inc, &metrics); |
| 324 |
|
| 325 |
if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { |
| 326 |
tp->t_srtt = rtt; |
| 327 |
tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; |
| 328 |
TCPSTAT_INC(tcps_usedrtt); |
| 329 |
if (metrics.rmx_rttvar) { |
| 330 |
tp->t_rttvar = metrics.rmx_rttvar; |
| 331 |
TCPSTAT_INC(tcps_usedrttvar); |
| 332 |
} else { |
| 333 |
/* default variation is +- 1 rtt */ |
| 334 |
tp->t_rttvar = |
| 335 |
tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; |
| 336 |
} |
| 337 |
TCPT_RANGESET(tp->t_rxtcur, |
| 338 |
((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, |
| 339 |
tp->t_rttmin, TCPTV_REXMTMAX); |
| 340 |
} |
| 341 |
if (metrics.rmx_ssthresh) { |
| 342 |
/* |
| 343 |
* There's some sort of gateway or interface |
| 344 |
* buffer limit on the path. Use this to set |
| 345 |
* the slow start threshhold, but set the |
| 346 |
* threshold to no less than 2*mss. |
| 347 |
*/ |
| 348 |
tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); |
| 349 |
TCPSTAT_INC(tcps_usedssthresh); |
| 350 |
} |
| 351 |
|
| 352 |
/* |
| 353 |
* Set the initial slow-start flight size. |
| 354 |
* |
| 355 |
* RFC5681 Section 3.1 specifies the default conservative values. |
| 356 |
* RFC3390 specifies slightly more aggressive values. |
| 357 |
* RFC6928 increases it to ten segments. |
| 358 |
* |
| 359 |
* If a SYN or SYN/ACK was lost and retransmitted, we have to |
| 360 |
* reduce the initial CWND to one segment as congestion is likely |
| 361 |
* requiring us to be cautious. |
| 362 |
*/ |
| 363 |
if (tp->snd_cwnd == 1) |
| 364 |
tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ |
| 365 |
else if (V_tcp_do_initcwnd10) |
| 366 |
tp->snd_cwnd = min(10 * tp->t_maxseg, |
| 367 |
max(2 * tp->t_maxseg, 14600)); |
| 368 |
else if (V_tcp_do_rfc3390) |
| 369 |
tp->snd_cwnd = min(4 * tp->t_maxseg, |
| 370 |
max(2 * tp->t_maxseg, 4380)); |
| 371 |
else { |
| 372 |
/* Per RFC5681 Section 3.1 */ |
| 373 |
if (tp->t_maxseg > 2190) |
| 374 |
tp->snd_cwnd = 2 * tp->t_maxseg; |
| 375 |
else if (tp->t_maxseg > 1095) |
| 376 |
tp->snd_cwnd = 3 * tp->t_maxseg; |
| 377 |
else |
| 378 |
tp->snd_cwnd = 4 * tp->t_maxseg; |
| 379 |
} |
| 380 |
|
| 381 |
if (CC_ALGO(tp)->conn_init != NULL) |
| 382 |
CC_ALGO(tp)->conn_init(tp->ccv); |
| 383 |
} |
| 384 |
|
| 385 |
void inline |
| 386 |
cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) |
| 387 |
{ |
| 388 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 389 |
|
| 390 |
switch(type) { |
| 391 |
case CC_NDUPACK: |
| 392 |
if (!IN_FASTRECOVERY(tp->t_flags)) { |
| 393 |
tp->snd_recover = tp->snd_max; |
| 394 |
if (tp->t_flags & TF_ECN_PERMIT) |
| 395 |
tp->t_flags |= TF_ECN_SND_CWR; |
| 396 |
} |
| 397 |
break; |
| 398 |
case CC_ECN: |
| 399 |
if (!IN_CONGRECOVERY(tp->t_flags)) { |
| 400 |
TCPSTAT_INC(tcps_ecn_rcwnd); |
| 401 |
tp->snd_recover = tp->snd_max; |
| 402 |
if (tp->t_flags & TF_ECN_PERMIT) |
| 403 |
tp->t_flags |= TF_ECN_SND_CWR; |
| 404 |
} |
| 405 |
break; |
| 406 |
case CC_RTO: |
| 407 |
tp->t_dupacks = 0; |
| 408 |
tp->t_bytes_acked = 0; |
| 409 |
EXIT_RECOVERY(tp->t_flags); |
| 410 |
tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / |
| 411 |
tp->t_maxseg) * tp->t_maxseg; |
| 412 |
tp->snd_cwnd = tp->t_maxseg; |
| 413 |
break; |
| 414 |
case CC_RTO_ERR: |
| 415 |
TCPSTAT_INC(tcps_sndrexmitbad); |
| 416 |
/* RTO was unnecessary, so reset everything. */ |
| 417 |
tp->snd_cwnd = tp->snd_cwnd_prev; |
| 418 |
tp->snd_ssthresh = tp->snd_ssthresh_prev; |
| 419 |
tp->snd_recover = tp->snd_recover_prev; |
| 420 |
if (tp->t_flags & TF_WASFRECOVERY) |
| 421 |
ENTER_FASTRECOVERY(tp->t_flags); |
| 422 |
if (tp->t_flags & TF_WASCRECOVERY) |
| 423 |
ENTER_CONGRECOVERY(tp->t_flags); |
| 424 |
tp->snd_nxt = tp->snd_max; |
| 425 |
tp->t_flags &= ~TF_PREVVALID; |
| 426 |
tp->t_badrxtwin = 0; |
| 427 |
break; |
| 428 |
} |
| 429 |
|
| 430 |
if (CC_ALGO(tp)->cong_signal != NULL) { |
| 431 |
if (th != NULL) |
| 432 |
tp->ccv->curack = th->th_ack; |
| 433 |
CC_ALGO(tp)->cong_signal(tp->ccv, type); |
| 434 |
} |
| 435 |
} |
| 436 |
|
| 437 |
static void inline |
| 438 |
cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) |
| 439 |
{ |
| 440 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 441 |
|
| 442 |
/* XXXLAS: KASSERT that we're in recovery? */ |
| 443 |
|
| 444 |
if (CC_ALGO(tp)->post_recovery != NULL) { |
| 445 |
tp->ccv->curack = th->th_ack; |
| 446 |
CC_ALGO(tp)->post_recovery(tp->ccv); |
| 447 |
} |
| 448 |
/* XXXLAS: EXIT_RECOVERY ? */ |
| 449 |
tp->t_bytes_acked = 0; |
| 450 |
} |
| 451 |
|
| 452 |
static inline void |
| 453 |
tcp_fields_to_host(struct tcphdr *th) |
| 454 |
{ |
| 455 |
|
| 456 |
th->th_seq = ntohl(th->th_seq); |
| 457 |
th->th_ack = ntohl(th->th_ack); |
| 458 |
th->th_win = ntohs(th->th_win); |
| 459 |
th->th_urp = ntohs(th->th_urp); |
| 460 |
} |
| 461 |
|
#ifdef TCP_SIGNATURE
/*
 * Convert, in place, the sequence, acknowledgement, window and urgent
 * pointer fields of a TCP header from host to network byte order.
 */
static inline void
tcp_fields_to_net(struct tcphdr *th)
{

	/* 16-bit fields. */
	th->th_win = htons(th->th_win);
	th->th_urp = htons(th->th_urp);

	/* 32-bit fields. */
	th->th_seq = htonl(th->th_seq);
	th->th_ack = htonl(th->th_ack);
}

/*
 * Run tcp_signature_verify() on an input segment whose header fields have
 * already been converted to host byte order.
 *
 * The header is switched to network byte order for the duration of the
 * call and switched back afterwards, so the caller sees th unchanged.
 * All arguments are passed through to tcp_signature_verify() as given and
 * its return value is returned unmodified.
 */
static inline int
tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
    struct tcpopt *to, struct tcphdr *th, u_int tcpbflag)
{
	int verdict;

	tcp_fields_to_net(th);
	verdict = tcp_signature_verify(m, off0, tlen, optlen, to, th,
	    tcpbflag);
	tcp_fields_to_host(th);

	return (verdict);
}
#endif
| 485 |
|
/*
 * Neighbor Discovery, Neighbor Unreachability Detection upper layer hint.
 *
 * With INET6, calls nd6_nud_hint(NULL, NULL, 0) when tp is non-NULL, has an
 * inpcb, and that inpcb has INP_IPV6 set in inp_vflag.  Without INET6 the
 * macro expands to nothing.  tp is evaluated more than once.
 */
#ifndef INET6
#define	ND6_HINT(tp)
#else
#define	ND6_HINT(tp)							\
do {									\
	if ((tp) != NULL && (tp)->t_inpcb != NULL &&			\
	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0)			\
		nd6_nud_hint(NULL, NULL, 0);				\
} while (0)
#endif
| 497 |
|
/*
 * Indicate whether the ACK for a received segment may be delayed.
 * Evaluates to non-zero only if ALL of the following hold:
 *  - there is no delayed ack timer in progress (TT_DELACK not active);
 *  - our last ack wasn't a 0-sized window (TF_RXWIN0SENT clear); we never
 *    want to delay the ack that opens up a 0-sized window;
 *  - tlen does not exceed tp->t_maxopd, i.e. the segment size is not
 *    larger than the MSS and LRO wasn't used for this segment;
 *  - delayed acks are enabled, or TF_NEEDSYN is set (this is a
 *    half-synchronized T/TCP connection).
 *
 * tp is a pointer to the connection's tcpcb, tlen the segment length.
 * Both parameters are parenthesized at every use so that arbitrary
 * expressions (e.g. "p + 1" or "a + b") expand correctly.  tp is still
 * evaluated several times, so it must not have side effects.
 */
#define	DELAY_ACK(tp, tlen)						\
	((!tcp_timer_active((tp), TT_DELACK) &&				\
	    ((tp)->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    ((tlen) <= (tp)->t_maxopd) &&				\
	    (V_tcp_delack_enabled || ((tp)->t_flags & TF_NEEDSYN)))
| 513 |
|
| 514 |
/* |
| 515 |
* TCP input handling is split into multiple parts: |
| 516 |
* tcp6_input is a thin wrapper around tcp_input for the extended |
| 517 |
* ip6_protox[] call format in ip6_input |
| 518 |
* tcp_input handles primary segment validation, inpcb lookup and |
| 519 |
* SYN processing on listen sockets |
| 520 |
* tcp_do_segment processes the ACK and text of the segment for |
| 521 |
* establishing, established and closing connections |
| 522 |
*/ |
#ifdef INET6
/*
 * IPv6 entry point for TCP: a thin wrapper around tcp_input().
 *
 *  mp    - in: pointer to the mbuf chain holding the packet.
 *  offp  - in: offset of the TCP header within the packet.
 *  proto - unused here.
 *
 * Segments whose destination interface address is flagged IN6_IFF_ANYCAST
 * are not passed to TCP; an ICMPv6 destination-unreachable (address) error
 * pointing at the destination address field is generated instead.
 * Always returns IPPROTO_DONE.
 */
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct in6_ifaddr *ia6;

	/* Presumably returns IPPROTO_DONE from here if the check fails. */
	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	ia6 = ip6_getdstifaddr(m);
	if (ia6 != NULL) {
		int to_anycast;

		/* Sample the flag first; the reference is dropped next. */
		to_anycast = (ia6->ia6_flags & IN6_IFF_ANYCAST) != 0;
		ifa_free(&ia6->ia_ifa);

		if (to_anycast) {
			struct ip6_hdr *ip6;

			ip6 = mtod(m, struct ip6_hdr *);
			icmp6_error(m, ICMP6_DST_UNREACH,
			    ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
			return IPPROTO_DONE;
		}
	}

	tcp_input(m, *offp);
	return IPPROTO_DONE;
}
#endif /* INET6 */
| 553 |
|
| 554 |
void |
| 555 |
tcp_input(struct mbuf *m, int off0) |
| 556 |
{ |
| 557 |
struct tcphdr *th = NULL; |
| 558 |
struct ip *ip = NULL; |
| 559 |
#ifdef INET |
| 560 |
struct ipovly *ipov; |
| 561 |
#endif |
| 562 |
struct inpcb *inp = NULL; |
| 563 |
struct tcpcb *tp = NULL; |
| 564 |
struct socket *so = NULL; |
| 565 |
u_char *optp = NULL; |
| 566 |
int optlen = 0; |
| 567 |
#ifdef INET |
| 568 |
int len; |
| 569 |
#endif |
| 570 |
int tlen = 0, off; |
| 571 |
int drop_hdrlen; |
| 572 |
int thflags; |
| 573 |
int rstreason = 0; /* For badport_bandlim accounting purposes */ |
| 574 |
#ifdef TCP_SIGNATURE |
| 575 |
uint8_t sig_checked = 0; |
| 576 |
#endif |
| 577 |
uint8_t iptos = 0; |
| 578 |
struct m_tag *fwd_tag = NULL; |
| 579 |
#ifdef INET6 |
| 580 |
struct ip6_hdr *ip6 = NULL; |
| 581 |
int isipv6; |
| 582 |
#else |
| 583 |
const void *ip6 = NULL; |
| 584 |
#endif /* INET6 */ |
| 585 |
struct tcpopt to; /* options in this segment */ |
| 586 |
char *s = NULL; /* address and port logging */ |
| 587 |
int ti_locked; |
| 588 |
#define TI_UNLOCKED 1 |
| 589 |
#define TI_WLOCKED 2 |
| 590 |
|
| 591 |
#ifdef TCPDEBUG |
| 592 |
/* |
| 593 |
* The size of tcp_saveipgen must be the size of the max ip header, |
| 594 |
* now IPv6. |
| 595 |
*/ |
| 596 |
u_char tcp_saveipgen[IP6_HDR_LEN]; |
| 597 |
struct tcphdr tcp_savetcp; |
| 598 |
short ostate = 0; |
| 599 |
#endif |
| 600 |
|
| 601 |
#ifdef INET6 |
| 602 |
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; |
| 603 |
#endif |
| 604 |
|
| 605 |
to.to_flags = 0; |
| 606 |
TCPSTAT_INC(tcps_rcvtotal); |
| 607 |
|
| 608 |
#ifdef INET6 |
| 609 |
if (isipv6) { |
| 610 |
/* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */ |
| 611 |
|
| 612 |
if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { |
| 613 |
m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); |
| 614 |
if (m == NULL) { |
| 615 |
TCPSTAT_INC(tcps_rcvshort); |
| 616 |
return; |
| 617 |
} |
| 618 |
} |
| 619 |
|
| 620 |
ip6 = mtod(m, struct ip6_hdr *); |
| 621 |
th = (struct tcphdr *)((caddr_t)ip6 + off0); |
| 622 |
tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; |
| 623 |
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { |
| 624 |
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) |
| 625 |
th->th_sum = m->m_pkthdr.csum_data; |
| 626 |
else |
| 627 |
th->th_sum = in6_cksum_pseudo(ip6, tlen, |
| 628 |
IPPROTO_TCP, m->m_pkthdr.csum_data); |
| 629 |
th->th_sum ^= 0xffff; |
| 630 |
} else |
| 631 |
th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); |
| 632 |
if (th->th_sum) { |
| 633 |
TCPSTAT_INC(tcps_rcvbadsum); |
| 634 |
goto drop; |
| 635 |
} |
| 636 |
|
| 637 |
/* |
| 638 |
* Be proactive about unspecified IPv6 address in source. |
| 639 |
* As we use all-zero to indicate unbounded/unconnected pcb, |
| 640 |
* unspecified IPv6 address can be used to confuse us. |
| 641 |
* |
| 642 |
* Note that packets with unspecified IPv6 destination is |
| 643 |
* already dropped in ip6_input. |
| 644 |
*/ |
| 645 |
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { |
| 646 |
/* XXX stat */ |
| 647 |
goto drop; |
| 648 |
} |
| 649 |
} |
| 650 |
#endif |
| 651 |
#if defined(INET) && defined(INET6) |
| 652 |
else |
| 653 |
#endif |
| 654 |
#ifdef INET |
| 655 |
{ |
| 656 |
/* |
| 657 |
* Get IP and TCP header together in first mbuf. |
| 658 |
* Note: IP leaves IP header in first mbuf. |
| 659 |
*/ |
| 660 |
if (off0 > sizeof (struct ip)) { |
| 661 |
ip_stripoptions(m, (struct mbuf *)0); |
| 662 |
off0 = sizeof(struct ip); |
| 663 |
} |
| 664 |
if (m->m_len < sizeof (struct tcpiphdr)) { |
| 665 |
if ((m = m_pullup(m, sizeof (struct tcpiphdr))) |
| 666 |
== NULL) { |
| 667 |
TCPSTAT_INC(tcps_rcvshort); |
| 668 |
return; |
| 669 |
} |
| 670 |
} |
| 671 |
ip = mtod(m, struct ip *); |
| 672 |
ipov = (struct ipovly *)ip; |
| 673 |
th = (struct tcphdr *)((caddr_t)ip + off0); |
| 674 |
tlen = ip->ip_len; |
| 675 |
|
| 676 |
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { |
| 677 |
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) |
| 678 |
th->th_sum = m->m_pkthdr.csum_data; |
| 679 |
else |
| 680 |
th->th_sum = in_pseudo(ip->ip_src.s_addr, |
| 681 |
ip->ip_dst.s_addr, |
| 682 |
htonl(m->m_pkthdr.csum_data + |
| 683 |
ip->ip_len + |
| 684 |
IPPROTO_TCP)); |
| 685 |
th->th_sum ^= 0xffff; |
| 686 |
#ifdef TCPDEBUG |
| 687 |
ipov->ih_len = (u_short)tlen; |
| 688 |
ipov->ih_len = htons(ipov->ih_len); |
| 689 |
#endif |
| 690 |
} else { |
| 691 |
/* |
| 692 |
* Checksum extended TCP header and data. |
| 693 |
*/ |
| 694 |
len = sizeof (struct ip) + tlen; |
| 695 |
bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); |
| 696 |
ipov->ih_len = (u_short)tlen; |
| 697 |
ipov->ih_len = htons(ipov->ih_len); |
| 698 |
th->th_sum = in_cksum(m, len); |
| 699 |
} |
| 700 |
if (th->th_sum) { |
| 701 |
TCPSTAT_INC(tcps_rcvbadsum); |
| 702 |
goto drop; |
| 703 |
} |
| 704 |
/* Re-initialization for later version check */ |
| 705 |
ip->ip_v = IPVERSION; |
| 706 |
} |
| 707 |
#endif /* INET */ |
| 708 |
|
| 709 |
#ifdef INET6 |
| 710 |
if (isipv6) |
| 711 |
iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; |
| 712 |
#endif |
| 713 |
#if defined(INET) && defined(INET6) |
| 714 |
else |
| 715 |
#endif |
| 716 |
#ifdef INET |
| 717 |
iptos = ip->ip_tos; |
| 718 |
#endif |
| 719 |
|
| 720 |
/* |
| 721 |
* Check that TCP offset makes sense, |
| 722 |
* pull out TCP options and adjust length. XXX |
| 723 |
*/ |
| 724 |
off = th->th_off << 2; |
| 725 |
if (off < sizeof (struct tcphdr) || off > tlen) { |
| 726 |
TCPSTAT_INC(tcps_rcvbadoff); |
| 727 |
goto drop; |
| 728 |
} |
| 729 |
tlen -= off; /* tlen is used instead of ti->ti_len */ |
| 730 |
if (off > sizeof (struct tcphdr)) { |
| 731 |
#ifdef INET6 |
| 732 |
if (isipv6) { |
| 733 |
IP6_EXTHDR_CHECK(m, off0, off, ); |
| 734 |
ip6 = mtod(m, struct ip6_hdr *); |
| 735 |
th = (struct tcphdr *)((caddr_t)ip6 + off0); |
| 736 |
} |
| 737 |
#endif |
| 738 |
#if defined(INET) && defined(INET6) |
| 739 |
else |
| 740 |
#endif |
| 741 |
#ifdef INET |
| 742 |
{ |
| 743 |
if (m->m_len < sizeof(struct ip) + off) { |
| 744 |
if ((m = m_pullup(m, sizeof (struct ip) + off)) |
| 745 |
== NULL) { |
| 746 |
TCPSTAT_INC(tcps_rcvshort); |
| 747 |
return; |
| 748 |
} |
| 749 |
ip = mtod(m, struct ip *); |
| 750 |
ipov = (struct ipovly *)ip; |
| 751 |
th = (struct tcphdr *)((caddr_t)ip + off0); |
| 752 |
} |
| 753 |
} |
| 754 |
#endif |
| 755 |
optlen = off - sizeof (struct tcphdr); |
| 756 |
optp = (u_char *)(th + 1); |
| 757 |
} |
| 758 |
thflags = th->th_flags; |
| 759 |
|
| 760 |
/* |
| 761 |
* Convert TCP protocol specific fields to host format. |
| 762 |
*/ |
| 763 |
tcp_fields_to_host(th); |
| 764 |
|
| 765 |
/* |
| 766 |
* Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. |
| 767 |
*/ |
| 768 |
drop_hdrlen = off0 + off; |
| 769 |
|
| 770 |
/* |
| 771 |
* Locate pcb for segment; if we're likely to add or remove a |
| 772 |
* connection then first acquire pcbinfo lock. There are two cases |
| 773 |
* where we might discover later we need a write lock despite the |
| 774 |
* flags: ACKs moving a connection out of the syncache, and ACKs for |
| 775 |
* a connection in TIMEWAIT. |
| 776 |
*/ |
| 777 |
if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) { |
| 778 |
INP_INFO_WLOCK(&V_tcbinfo); |
| 779 |
ti_locked = TI_WLOCKED; |
| 780 |
} else |
| 781 |
ti_locked = TI_UNLOCKED; |
| 782 |
|
| 783 |
findpcb: |
| 784 |
#ifdef INVARIANTS |
| 785 |
if (ti_locked == TI_WLOCKED) { |
| 786 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 787 |
} else { |
| 788 |
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); |
| 789 |
} |
| 790 |
#endif |
| 791 |
|
| 792 |
/* |
| 793 |
* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. |
| 794 |
*/ |
| 795 |
if ( |
| 796 |
#ifdef INET6 |
| 797 |
(isipv6 && (m->m_flags & M_IP6_NEXTHOP)) |
| 798 |
#ifdef INET |
| 799 |
|| (!isipv6 && (m->m_flags & M_IP_NEXTHOP)) |
| 800 |
#endif |
| 801 |
#endif |
| 802 |
#if defined(INET) && !defined(INET6) |
| 803 |
(m->m_flags & M_IP_NEXTHOP) |
| 804 |
#endif |
| 805 |
) |
| 806 |
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); |
| 807 |
|
| 808 |
#ifdef INET6 |
| 809 |
if (isipv6 && fwd_tag != NULL) { |
| 810 |
struct sockaddr_in6 *next_hop6; |
| 811 |
|
| 812 |
next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); |
| 813 |
/* |
| 814 |
* Transparently forwarded. Pretend to be the destination. |
| 815 |
* Already got one like this? |
| 816 |
*/ |
| 817 |
inp = in6_pcblookup_mbuf(&V_tcbinfo, |
| 818 |
&ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, |
| 819 |
INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); |
| 820 |
if (!inp) { |
| 821 |
/* |
| 822 |
* It's new. Try to find the ambushing socket. |
| 823 |
* Because we've rewritten the destination address, |
| 824 |
* any hardware-generated hash is ignored. |
| 825 |
*/ |
| 826 |
inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, |
| 827 |
th->th_sport, &next_hop6->sin6_addr, |
| 828 |
next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : |
| 829 |
th->th_dport, INPLOOKUP_WILDCARD | |
| 830 |
INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); |
| 831 |
} |
| 832 |
/* Remove the tag from the packet. We don't need it anymore. */ |
| 833 |
m_tag_delete(m, fwd_tag); |
| 834 |
m->m_flags &= ~M_IP6_NEXTHOP; |
| 835 |
fwd_tag = NULL; |
| 836 |
} else if (isipv6) { |
| 837 |
inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, |
| 838 |
th->th_sport, &ip6->ip6_dst, th->th_dport, |
| 839 |
INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, |
| 840 |
m->m_pkthdr.rcvif, m); |
| 841 |
} |
| 842 |
#endif /* INET6 */ |
| 843 |
#if defined(INET6) && defined(INET) |
| 844 |
else |
| 845 |
#endif |
| 846 |
#ifdef INET |
| 847 |
if (fwd_tag != NULL) { |
| 848 |
struct sockaddr_in *next_hop; |
| 849 |
|
| 850 |
next_hop = (struct sockaddr_in *)(fwd_tag+1); |
| 851 |
/* |
| 852 |
* Transparently forwarded. Pretend to be the destination. |
| 853 |
* already got one like this? |
| 854 |
*/ |
| 855 |
inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, |
| 856 |
ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB, |
| 857 |
m->m_pkthdr.rcvif, m); |
| 858 |
if (!inp) { |
| 859 |
/* |
| 860 |
* It's new. Try to find the ambushing socket. |
| 861 |
* Because we've rewritten the destination address, |
| 862 |
* any hardware-generated hash is ignored. |
| 863 |
*/ |
| 864 |
inp = in_pcblookup(&V_tcbinfo, ip->ip_src, |
| 865 |
th->th_sport, next_hop->sin_addr, |
| 866 |
next_hop->sin_port ? ntohs(next_hop->sin_port) : |
| 867 |
th->th_dport, INPLOOKUP_WILDCARD | |
| 868 |
INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); |
| 869 |
} |
| 870 |
/* Remove the tag from the packet. We don't need it anymore. */ |
| 871 |
m_tag_delete(m, fwd_tag); |
| 872 |
m->m_flags &= ~M_IP_NEXTHOP; |
| 873 |
fwd_tag = NULL; |
| 874 |
} else |
| 875 |
inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, |
| 876 |
th->th_sport, ip->ip_dst, th->th_dport, |
| 877 |
INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, |
| 878 |
m->m_pkthdr.rcvif, m); |
| 879 |
#endif /* INET */ |
| 880 |
|
| 881 |
/* |
| 882 |
* If the INPCB does not exist then all data in the incoming |
| 883 |
* segment is discarded and an appropriate RST is sent back. |
| 884 |
* XXX MRT Send RST using which routing table? |
| 885 |
*/ |
| 886 |
if (inp == NULL) { |
| 887 |
/* |
| 888 |
* Log communication attempts to ports that are not |
| 889 |
* in use. |
| 890 |
*/ |
| 891 |
if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || |
| 892 |
tcp_log_in_vain == 2) { |
| 893 |
if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) |
| 894 |
log(LOG_INFO, "%s; %s: Connection attempt " |
| 895 |
"to closed port\n", s, __func__); |
| 896 |
} |
| 897 |
/* |
| 898 |
* When blackholing do not respond with a RST but |
| 899 |
* completely ignore the segment and drop it. |
| 900 |
*/ |
| 901 |
if ((V_blackhole == 1 && (thflags & TH_SYN)) || |
| 902 |
V_blackhole == 2) |
| 903 |
goto dropunlock; |
| 904 |
|
| 905 |
rstreason = BANDLIM_RST_CLOSEDPORT; |
| 906 |
goto dropwithreset; |
| 907 |
} |
| 908 |
INP_WLOCK_ASSERT(inp); |
| 909 |
if (!(inp->inp_flags & INP_HW_FLOWID) |
| 910 |
&& (m->m_flags & M_FLOWID) |
| 911 |
&& ((inp->inp_socket == NULL) |
| 912 |
|| !(inp->inp_socket->so_options & SO_ACCEPTCONN))) { |
| 913 |
inp->inp_flags |= INP_HW_FLOWID; |
| 914 |
inp->inp_flags &= ~INP_SW_FLOWID; |
| 915 |
inp->inp_flowid = m->m_pkthdr.flowid; |
| 916 |
} |
| 917 |
#ifdef IPSEC |
| 918 |
#ifdef INET6 |
| 919 |
if (isipv6 && ipsec6_in_reject(m, inp)) { |
| 920 |
IPSEC6STAT_INC(in_polvio); |
| 921 |
goto dropunlock; |
| 922 |
} else |
| 923 |
#endif /* INET6 */ |
| 924 |
if (ipsec4_in_reject(m, inp) != 0) { |
| 925 |
IPSECSTAT_INC(in_polvio); |
| 926 |
goto dropunlock; |
| 927 |
} |
| 928 |
#endif /* IPSEC */ |
| 929 |
|
| 930 |
/* |
| 931 |
* Check the minimum TTL for socket. |
| 932 |
*/ |
| 933 |
if (inp->inp_ip_minttl != 0) { |
| 934 |
#ifdef INET6 |
| 935 |
if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) |
| 936 |
goto dropunlock; |
| 937 |
else |
| 938 |
#endif |
| 939 |
if (inp->inp_ip_minttl > ip->ip_ttl) |
| 940 |
goto dropunlock; |
| 941 |
} |
| 942 |
|
| 943 |
/* |
| 944 |
* A previous connection in TIMEWAIT state is supposed to catch stray |
| 945 |
* or duplicate segments arriving late. If this segment was a |
| 946 |
* legitimate new connection attempt the old INPCB gets removed and |
| 947 |
* we can try again to find a listening socket. |
| 948 |
* |
| 949 |
* At this point, due to earlier optimism, we may hold only an inpcb |
| 950 |
* lock, and not the inpcbinfo write lock. If so, we need to try to |
| 951 |
* acquire it, or if that fails, acquire a reference on the inpcb, |
| 952 |
* drop all locks, acquire a global write lock, and then re-acquire |
| 953 |
* the inpcb lock. We may at that point discover that another thread |
| 954 |
* has tried to free the inpcb, in which case we need to loop back |
| 955 |
* and try to find a new inpcb to deliver to. |
| 956 |
* |
| 957 |
* XXXRW: It may be time to rethink timewait locking. |
| 958 |
*/ |
| 959 |
relocked: |
| 960 |
if (inp->inp_flags & INP_TIMEWAIT) { |
| 961 |
if (ti_locked == TI_UNLOCKED) { |
| 962 |
if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { |
| 963 |
in_pcbref(inp); |
| 964 |
INP_WUNLOCK(inp); |
| 965 |
INP_INFO_WLOCK(&V_tcbinfo); |
| 966 |
ti_locked = TI_WLOCKED; |
| 967 |
INP_WLOCK(inp); |
| 968 |
if (in_pcbrele_wlocked(inp)) { |
| 969 |
inp = NULL; |
| 970 |
goto findpcb; |
| 971 |
} |
| 972 |
} else |
| 973 |
ti_locked = TI_WLOCKED; |
| 974 |
} |
| 975 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 976 |
|
| 977 |
if (thflags & TH_SYN) |
| 978 |
tcp_dooptions(&to, optp, optlen, TO_SYN); |
| 979 |
/* |
| 980 |
* NB: tcp_twcheck unlocks the INP and frees the mbuf. |
| 981 |
*/ |
| 982 |
if (tcp_twcheck(inp, &to, th, m, tlen)) |
| 983 |
goto findpcb; |
| 984 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 985 |
return; |
| 986 |
} |
| 987 |
/* |
| 988 |
* The TCPCB may no longer exist if the connection is winding |
| 989 |
* down or it is in the CLOSED state. Either way we drop the |
| 990 |
* segment and send an appropriate response. |
| 991 |
*/ |
| 992 |
tp = intotcpcb(inp); |
| 993 |
if (tp == NULL || tp->t_state == TCPS_CLOSED) { |
| 994 |
rstreason = BANDLIM_RST_CLOSEDPORT; |
| 995 |
goto dropwithreset; |
| 996 |
} |
| 997 |
|
| 998 |
#ifdef TCP_OFFLOAD |
| 999 |
if (tp->t_flags & TF_TOE) { |
| 1000 |
tcp_offload_input(tp, m); |
| 1001 |
m = NULL; /* consumed by the TOE driver */ |
| 1002 |
goto dropunlock; |
| 1003 |
} |
| 1004 |
#endif |
| 1005 |
|
| 1006 |
/* |
| 1007 |
* We've identified a valid inpcb, but it could be that we need an |
| 1008 |
* inpcbinfo write lock but don't hold it. In this case, attempt to |
| 1009 |
* acquire using the same strategy as the TIMEWAIT case above. If we |
| 1010 |
* relock, we have to jump back to 'relocked' as the connection might |
| 1011 |
* now be in TIMEWAIT. |
| 1012 |
*/ |
| 1013 |
#ifdef INVARIANTS |
| 1014 |
if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) |
| 1015 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 1016 |
#endif |
| 1017 |
if (tp->t_state != TCPS_ESTABLISHED) { |
| 1018 |
if (ti_locked == TI_UNLOCKED) { |
| 1019 |
if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { |
| 1020 |
in_pcbref(inp); |
| 1021 |
INP_WUNLOCK(inp); |
| 1022 |
INP_INFO_WLOCK(&V_tcbinfo); |
| 1023 |
ti_locked = TI_WLOCKED; |
| 1024 |
INP_WLOCK(inp); |
| 1025 |
if (in_pcbrele_wlocked(inp)) { |
| 1026 |
inp = NULL; |
| 1027 |
goto findpcb; |
| 1028 |
} |
| 1029 |
goto relocked; |
| 1030 |
} else |
| 1031 |
ti_locked = TI_WLOCKED; |
| 1032 |
} |
| 1033 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 1034 |
} |
| 1035 |
|
| 1036 |
#ifdef MAC |
| 1037 |
INP_WLOCK_ASSERT(inp); |
| 1038 |
if (mac_inpcb_check_deliver(inp, m)) |
| 1039 |
goto dropunlock; |
| 1040 |
#endif |
| 1041 |
so = inp->inp_socket; |
| 1042 |
KASSERT(so != NULL, ("%s: so == NULL", __func__)); |
| 1043 |
#ifdef TCPDEBUG |
| 1044 |
if (so->so_options & SO_DEBUG) { |
| 1045 |
ostate = tp->t_state; |
| 1046 |
#ifdef INET6 |
| 1047 |
if (isipv6) { |
| 1048 |
bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); |
| 1049 |
} else |
| 1050 |
#endif |
| 1051 |
bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); |
| 1052 |
tcp_savetcp = *th; |
| 1053 |
} |
| 1054 |
#endif /* TCPDEBUG */ |
| 1055 |
/* |
| 1056 |
* When the socket is accepting connections (the INPCB is in LISTEN |
| 1057 |
* state) we look into the SYN cache if this is a new connection |
| 1058 |
* attempt or the completion of a previous one. Because listen |
| 1059 |
* sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be |
| 1060 |
* held in this case. |
| 1061 |
*/ |
| 1062 |
if (so->so_options & SO_ACCEPTCONN) { |
| 1063 |
struct in_conninfo inc; |
| 1064 |
|
| 1065 |
KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " |
| 1066 |
"tp not listening", __func__)); |
| 1067 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 1068 |
|
| 1069 |
bzero(&inc, sizeof(inc)); |
| 1070 |
#ifdef INET6 |
| 1071 |
if (isipv6) { |
| 1072 |
inc.inc_flags |= INC_ISIPV6; |
| 1073 |
inc.inc6_faddr = ip6->ip6_src; |
| 1074 |
inc.inc6_laddr = ip6->ip6_dst; |
| 1075 |
} else |
| 1076 |
#endif |
| 1077 |
{ |
| 1078 |
inc.inc_faddr = ip->ip_src; |
| 1079 |
inc.inc_laddr = ip->ip_dst; |
| 1080 |
} |
| 1081 |
inc.inc_fport = th->th_sport; |
| 1082 |
inc.inc_lport = th->th_dport; |
| 1083 |
inc.inc_fibnum = so->so_fibnum; |
| 1084 |
|
| 1085 |
/* |
| 1086 |
* Check for an existing connection attempt in syncache if |
| 1087 |
* the flag is only ACK. A successful lookup creates a new |
| 1088 |
* socket appended to the listen queue in SYN_RECEIVED state. |
| 1089 |
*/ |
| 1090 |
if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { |
| 1091 |
/* |
| 1092 |
* Parse the TCP options here because |
| 1093 |
* syncookies need access to the reflected |
| 1094 |
* timestamp. |
| 1095 |
*/ |
| 1096 |
tcp_dooptions(&to, optp, optlen, 0); |
| 1097 |
/* |
| 1098 |
* NB: syncache_expand() doesn't unlock |
| 1099 |
* inp and tcbinfo locks. |
| 1100 |
*/ |
| 1101 |
if (!syncache_expand(&inc, &to, th, &so, m)) { |
| 1102 |
/* |
| 1103 |
* No syncache entry or ACK was not |
| 1104 |
* for our SYN/ACK. Send a RST. |
| 1105 |
* NB: syncache did its own logging |
| 1106 |
* of the failure cause. |
| 1107 |
*/ |
| 1108 |
rstreason = BANDLIM_RST_OPENPORT; |
| 1109 |
goto dropwithreset; |
| 1110 |
} |
| 1111 |
if (so == NULL) { |
| 1112 |
/* |
| 1113 |
* We completed the 3-way handshake |
| 1114 |
* but could not allocate a socket |
| 1115 |
* either due to memory shortage, |
| 1116 |
* listen queue length limits or |
| 1117 |
* global socket limits. Send RST |
| 1118 |
* or wait and have the remote end |
| 1119 |
* retransmit the ACK for another |
| 1120 |
* try. |
| 1121 |
*/ |
| 1122 |
if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) |
| 1123 |
log(LOG_DEBUG, "%s; %s: Listen socket: " |
| 1124 |
"Socket allocation failed due to " |
| 1125 |
"limits or memory shortage, %s\n", |
| 1126 |
s, __func__, |
| 1127 |
V_tcp_sc_rst_sock_fail ? |
| 1128 |
"sending RST" : "try again"); |
| 1129 |
if (V_tcp_sc_rst_sock_fail) { |
| 1130 |
rstreason = BANDLIM_UNLIMITED; |
| 1131 |
goto dropwithreset; |
| 1132 |
} else |
| 1133 |
goto dropunlock; |
| 1134 |
} |
| 1135 |
/* |
| 1136 |
* Socket is created in state SYN_RECEIVED. |
| 1137 |
* Unlock the listen socket, lock the newly |
| 1138 |
* created socket and update the tp variable. |
| 1139 |
*/ |
| 1140 |
INP_WUNLOCK(inp); /* listen socket */ |
| 1141 |
inp = sotoinpcb(so); |
| 1142 |
INP_WLOCK(inp); /* new connection */ |
| 1143 |
tp = intotcpcb(inp); |
| 1144 |
KASSERT(tp->t_state == TCPS_SYN_RECEIVED, |
| 1145 |
("%s: ", __func__)); |
| 1146 |
#ifdef TCP_SIGNATURE |
| 1147 |
if (sig_checked == 0) { |
| 1148 |
tcp_dooptions(&to, optp, optlen, |
| 1149 |
(thflags & TH_SYN) ? TO_SYN : 0); |
| 1150 |
if (!tcp_signature_verify_input(m, off0, tlen, |
| 1151 |
optlen, &to, th, tp->t_flags)) { |
| 1152 |
|
| 1153 |
/* |
| 1154 |
* In SYN_SENT state if it receives an |
| 1155 |
* RST, it is allowed for further |
| 1156 |
* processing. |
| 1157 |
*/ |
| 1158 |
if ((thflags & TH_RST) == 0 || |
| 1159 |
(tp->t_state == TCPS_SYN_SENT) == 0) |
| 1160 |
goto dropunlock; |
| 1161 |
} |
| 1162 |
sig_checked = 1; |
| 1163 |
} |
| 1164 |
#endif |
| 1165 |
|
| 1166 |
/* |
| 1167 |
* Process the segment and the data it |
| 1168 |
* contains. tcp_do_segment() consumes |
| 1169 |
* the mbuf chain and unlocks the inpcb. |
| 1170 |
*/ |
| 1171 |
tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, |
| 1172 |
iptos, ti_locked); |
| 1173 |
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); |
| 1174 |
return; |
| 1175 |
} |
| 1176 |
/* |
| 1177 |
* Segment flag validation for new connection attempts: |
| 1178 |
* |
| 1179 |
* Our (SYN|ACK) response was rejected. |
| 1180 |
* Check with syncache and remove entry to prevent |
| 1181 |
* retransmits. |
| 1182 |
* |
| 1183 |
* NB: syncache_chkrst does its own logging of failure |
| 1184 |
* causes. |
| 1185 |
*/ |
| 1186 |
if (thflags & TH_RST) { |
| 1187 |
syncache_chkrst(&inc, th); |
| 1188 |
goto dropunlock; |
| 1189 |
} |
| 1190 |
/* |
| 1191 |
* We can't do anything without SYN. |
| 1192 |
*/ |
| 1193 |
if ((thflags & TH_SYN) == 0) { |
| 1194 |
if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) |
| 1195 |
log(LOG_DEBUG, "%s; %s: Listen socket: " |
| 1196 |
"SYN is missing, segment ignored\n", |
| 1197 |
s, __func__); |
| 1198 |
TCPSTAT_INC(tcps_badsyn); |
| 1199 |
goto dropunlock; |
| 1200 |
} |
| 1201 |
/* |
| 1202 |
* (SYN|ACK) is bogus on a listen socket. |
| 1203 |
*/ |
| 1204 |
if (thflags & TH_ACK) { |
| 1205 |
if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) |
| 1206 |
log(LOG_DEBUG, "%s; %s: Listen socket: " |
| 1207 |
"SYN|ACK invalid, segment rejected\n", |
| 1208 |
s, __func__); |
| 1209 |
syncache_badack(&inc); /* XXX: Not needed! */ |
| 1210 |
TCPSTAT_INC(tcps_badsyn); |
| 1211 |
rstreason = BANDLIM_RST_OPENPORT; |
| 1212 |
goto dropwithreset; |
| 1213 |
} |
| 1214 |
/* |
| 1215 |
* If the drop_synfin option is enabled, drop all |
| 1216 |
* segments with both the SYN and FIN bits set. |
| 1217 |
* This prevents e.g. nmap from identifying the |
| 1218 |
* TCP/IP stack. |
| 1219 |
* XXX: Poor reasoning. nmap has other methods |
| 1220 |
* and is constantly refining its stack detection |
| 1221 |
* strategies. |
| 1222 |
* XXX: This is a violation of the TCP specification |
| 1223 |
* and was used by RFC1644. |
| 1224 |
*/ |
| 1225 |
if ((thflags & TH_FIN) && V_drop_synfin) { |
| 1226 |
if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) |
| 1227 |
log(LOG_DEBUG, "%s; %s: Listen socket: " |
| 1228 |
"SYN|FIN segment ignored (based on " |
| 1229 |
"sysctl setting)\n", s, __func__); |
| 1230 |
TCPSTAT_INC(tcps_badsyn); |
| 1231 |
goto dropunlock; |
| 1232 |
} |
| 1233 |
/* |
| 1234 |
* Segment's flags are (SYN) or (SYN|FIN). |
| 1235 |
* |
| 1236 |
* TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored |
| 1237 |
* as they do not affect the state of the TCP FSM. |
| 1238 |
* The data pointed to by TH_URG and th_urp is ignored. |
| 1239 |
*/ |
| 1240 |
KASSERT((thflags & (TH_RST|TH_ACK)) == 0, |
| 1241 |
("%s: Listen socket: TH_RST or TH_ACK set", __func__)); |
| 1242 |
KASSERT(thflags & (TH_SYN), |
| 1243 |
("%s: Listen socket: TH_SYN not set", __func__)); |
| 1244 |
#ifdef INET6 |
| 1245 |
/* |
| 1246 |
* If deprecated address is forbidden, |
| 1247 |
* we do not accept SYN to deprecated interface |
| 1248 |
* address to prevent any new inbound connection from |
| 1249 |
* getting established. |
| 1250 |
* When we do not accept SYN, we send a TCP RST, |
| 1251 |
* with deprecated source address (instead of dropping |
| 1252 |
* it). We compromise it as it is much better for peer |
| 1253 |
* to send a RST, and RST will be the final packet |
| 1254 |
* for the exchange. |
| 1255 |
* |
| 1256 |
* If we do not forbid deprecated addresses, we accept |
| 1257 |
* the SYN packet. RFC2462 does not suggest dropping |
| 1258 |
* SYN in this case. |
| 1259 |
* If we decipher RFC2462 5.5.4, it says like this: |
| 1260 |
* 1. use of deprecated addr with existing |
| 1261 |
* communication is okay - "SHOULD continue to be |
| 1262 |
* used" |
| 1263 |
* 2. use of it with new communication: |
| 1264 |
* (2a) "SHOULD NOT be used if alternate address |
| 1265 |
* with sufficient scope is available" |
| 1266 |
* (2b) nothing mentioned otherwise. |
| 1267 |
* Here we fall into (2b) case as we have no choice in |
| 1268 |
* our source address selection - we must obey the peer. |
| 1269 |
* |
| 1270 |
* The wording in RFC2462 is confusing, and there are |
| 1271 |
* multiple descriptions of deprecated address |
| 1272 |
* handling - worse, they are not exactly the same. |
| 1273 |
* I believe 5.5.4 is the best one, so we follow 5.5.4. |
| 1274 |
*/ |
| 1275 |
if (isipv6 && !V_ip6_use_deprecated) { |
| 1276 |
struct in6_ifaddr *ia6; |
| 1277 |
|
| 1278 |
ia6 = ip6_getdstifaddr(m); |
| 1279 |
if (ia6 != NULL && |
| 1280 |
(ia6->ia6_flags & IN6_IFF_DEPRECATED)) { |
| 1281 |
ifa_free(&ia6->ia_ifa); |
| 1282 |
if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) |
| 1283 |
log(LOG_DEBUG, "%s; %s: Listen socket: " |
| 1284 |
"Connection attempt to deprecated " |
| 1285 |
"IPv6 address rejected\n", |
| 1286 |
s, __func__); |
| 1287 |
rstreason = BANDLIM_RST_OPENPORT; |
| 1288 |
goto dropwithreset; |
| 1289 |
} |
| 1290 |
if (ia6) |
| 1291 |
ifa_free(&ia6->ia_ifa); |
| 1292 |
} |
| 1293 |
#endif /* INET6 */ |
| 1294 |
/* |
| 1295 |
* Basic sanity checks on incoming SYN requests: |
| 1296 |
* Don't respond if the destination is a link layer |
| 1297 |
* broadcast according to RFC1122 4.2.3.10, p. 104. |
| 1298 |
* If it is from this socket it must be forged. |
| 1299 |
* Don't respond if the source or destination is a |
| 1300 |
* global or subnet broad- or multicast address. |
| 1301 |
* Note that it is quite possible to receive unicast |
| 1302 |
* link-layer packets with a broadcast IP address. Use |
| 1303 |
* in_broadcast() to find them. |
| 1304 |
*/ |
| 1305 |
if (m->m_flags & (M_BCAST|M_MCAST)) { |
| 1306 |
if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) |
| 1307 |
log(LOG_DEBUG, "%s; %s: Listen socket: " |
| 1308 |
"Connection attempt from broad- or multicast " |
| 1309 |
"link layer address ignored\n", s, __func__); |
| 1310 |
goto dropunlock; |
| 1311 |
} |
| 1312 |
#ifdef INET6 |
| 1313 |
if (isipv6) { |
| 1314 |
if (th->th_dport == th->th_sport && |
| 1315 |
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { |
| 1316 |
if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) |
| 1317 |
log(LOG_DEBUG, "%s; %s: Listen socket: " |
| 1318 |
"Connection attempt to/from self " |
| 1319 |
"ignored\n", s, __func__); |
| 1320 |
goto dropunlock; |
| 1321 |
} |
| 1322 |
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || |
| 1323 |
IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { |
| 1324 |
if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) |
| 1325 |
log(LOG_DEBUG, "%s; %s: Listen socket: " |
| 1326 |
"Connection attempt from/to multicast " |
| 1327 |
"address ignored\n", s, __func__); |
| 1328 |
goto dropunlock; |
| 1329 |
} |
| 1330 |
} |
| 1331 |
#endif |
| 1332 |
#if defined(INET) && defined(INET6) |
| 1333 |
else |
| 1334 |
#endif |
| 1335 |
#ifdef INET |
| 1336 |
{ |
| 1337 |
if (th->th_dport == th->th_sport && |
| 1338 |
ip->ip_dst.s_addr == ip->ip_src.s_addr) { |
| 1339 |
if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) |
| 1340 |
log(LOG_DEBUG, "%s; %s: Listen socket: " |
| 1341 |
"Connection attempt from/to self " |
| 1342 |
"ignored\n", s, __func__); |
| 1343 |
goto dropunlock; |
| 1344 |
} |
| 1345 |
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || |
| 1346 |
IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || |
| 1347 |
ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || |
| 1348 |
in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { |
| 1349 |
if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) |
| 1350 |
log(LOG_DEBUG, "%s; %s: Listen socket: " |
| 1351 |
"Connection attempt from/to broad- " |
| 1352 |
"or multicast address ignored\n", |
| 1353 |
s, __func__); |
| 1354 |
goto dropunlock; |
| 1355 |
} |
| 1356 |
} |
| 1357 |
#endif |
| 1358 |
/* |
| 1359 |
* SYN appears to be valid. Create compressed TCP state |
| 1360 |
* for syncache. |
| 1361 |
*/ |
| 1362 |
#ifdef TCPDEBUG |
| 1363 |
if (so->so_options & SO_DEBUG) |
| 1364 |
tcp_trace(TA_INPUT, ostate, tp, |
| 1365 |
(void *)tcp_saveipgen, &tcp_savetcp, 0); |
| 1366 |
#endif |
| 1367 |
tcp_dooptions(&to, optp, optlen, TO_SYN); |
| 1368 |
syncache_add(&inc, &to, th, inp, &so, m); |
| 1369 |
/* |
| 1370 |
* Entry added to syncache and mbuf consumed. |
| 1371 |
* Everything already unlocked by syncache_add(). |
| 1372 |
*/ |
| 1373 |
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); |
| 1374 |
return; |
| 1375 |
} else if (tp->t_state == TCPS_LISTEN) { |
| 1376 |
/* |
| 1377 |
* When a listen socket is torn down the SO_ACCEPTCONN |
| 1378 |
* flag is removed first while connections are drained |
| 1379 |
* from the accept queue in an unlock/lock cycle of the |
| 1380 |
* ACCEPT_LOCK, opening a race condition allowing a SYN |
| 1381 |
* attempt to go through unhandled. |
| 1382 |
*/ |
| 1383 |
goto dropunlock; |
| 1384 |
} |
| 1385 |
|
| 1386 |
#ifdef TCP_SIGNATURE |
| 1387 |
if (sig_checked == 0) { |
| 1388 |
tcp_dooptions(&to, optp, optlen, |
| 1389 |
(thflags & TH_SYN) ? TO_SYN : 0); |
| 1390 |
if (!tcp_signature_verify_input(m, off0, tlen, optlen, &to, |
| 1391 |
th, tp->t_flags)) { |
| 1392 |
|
| 1393 |
/* |
| 1394 |
* In SYN_SENT state if it receives an RST, it is |
| 1395 |
* allowed for further processing. |
| 1396 |
*/ |
| 1397 |
if ((thflags & TH_RST) == 0 || |
| 1398 |
(tp->t_state == TCPS_SYN_SENT) == 0) |
| 1399 |
goto dropunlock; |
| 1400 |
} |
| 1401 |
sig_checked = 1; |
| 1402 |
} |
| 1403 |
#endif |
| 1404 |
|
| 1405 |
/* |
| 1406 |
* Segment belongs to a connection in SYN_SENT, ESTABLISHED or later |
| 1407 |
* state. tcp_do_segment() always consumes the mbuf chain, unlocks |
| 1408 |
* the inpcb, and unlocks pcbinfo. |
| 1409 |
*/ |
| 1410 |
tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); |
| 1411 |
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); |
| 1412 |
return; |
| 1413 |
|
| 1414 |
dropwithreset: |
| 1415 |
if (ti_locked == TI_WLOCKED) { |
| 1416 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 1417 |
ti_locked = TI_UNLOCKED; |
| 1418 |
} |
| 1419 |
#ifdef INVARIANTS |
| 1420 |
else { |
| 1421 |
KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " |
| 1422 |
"ti_locked: %d", __func__, ti_locked)); |
| 1423 |
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); |
| 1424 |
} |
| 1425 |
#endif |
| 1426 |
|
| 1427 |
if (inp != NULL) { |
| 1428 |
tcp_dropwithreset(m, th, tp, tlen, rstreason); |
| 1429 |
INP_WUNLOCK(inp); |
| 1430 |
} else |
| 1431 |
tcp_dropwithreset(m, th, NULL, tlen, rstreason); |
| 1432 |
m = NULL; /* mbuf chain got consumed. */ |
| 1433 |
goto drop; |
| 1434 |
|
| 1435 |
dropunlock: |
| 1436 |
if (ti_locked == TI_WLOCKED) { |
| 1437 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 1438 |
ti_locked = TI_UNLOCKED; |
| 1439 |
} |
| 1440 |
#ifdef INVARIANTS |
| 1441 |
else { |
| 1442 |
KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " |
| 1443 |
"ti_locked: %d", __func__, ti_locked)); |
| 1444 |
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); |
| 1445 |
} |
| 1446 |
#endif |
| 1447 |
|
| 1448 |
if (inp != NULL) |
| 1449 |
INP_WUNLOCK(inp); |
| 1450 |
|
| 1451 |
drop: |
| 1452 |
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); |
| 1453 |
if (s != NULL) |
| 1454 |
free(s, M_TCPLOG); |
| 1455 |
if (m != NULL) |
| 1456 |
m_freem(m); |
| 1457 |
} |
| 1458 |
|
| 1459 |
static void |
| 1460 |
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, |
| 1461 |
struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, |
| 1462 |
int ti_locked) |
| 1463 |
{ |
| 1464 |
int thflags, acked, ourfinisacked, needoutput = 0; |
| 1465 |
int rstreason, todrop, win; |
| 1466 |
u_long tiwin; |
| 1467 |
struct tcpopt to; |
| 1468 |
|
| 1469 |
#ifdef TCPDEBUG |
| 1470 |
/* |
| 1471 |
* The size of tcp_saveipgen must be the size of the max ip header, |
| 1472 |
* now IPv6. |
| 1473 |
*/ |
| 1474 |
u_char tcp_saveipgen[IP6_HDR_LEN]; |
| 1475 |
struct tcphdr tcp_savetcp; |
| 1476 |
short ostate = 0; |
| 1477 |
#endif |
| 1478 |
thflags = th->th_flags; |
| 1479 |
tp->sackhint.last_sack_ack = 0; |
| 1480 |
|
| 1481 |
/* |
| 1482 |
* If this is either a state-changing packet or current state isn't |
| 1483 |
* established, we require a write lock on tcbinfo. Otherwise, we |
| 1484 |
* allow either a read lock or a write lock, as we may have acquired |
| 1485 |
* a write lock due to a race. |
| 1486 |
* |
| 1487 |
* Require a global write lock for SYN/FIN/RST segments or |
| 1488 |
* non-established connections; otherwise accept either a read or |
| 1489 |
* write lock, as we may have conservatively acquired a write lock in |
| 1490 |
* certain cases in tcp_input() (is this still true?). Currently we |
| 1491 |
* will never enter with no lock, so we try to drop it quickly in the |
| 1492 |
* common pure ack/pure data cases. |
| 1493 |
*/ |
| 1494 |
if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || |
| 1495 |
tp->t_state != TCPS_ESTABLISHED) { |
| 1496 |
KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for " |
| 1497 |
"SYN/FIN/RST/!EST", __func__, ti_locked)); |
| 1498 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 1499 |
} else { |
| 1500 |
#ifdef INVARIANTS |
| 1501 |
if (ti_locked == TI_WLOCKED) |
| 1502 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 1503 |
else { |
| 1504 |
KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " |
| 1505 |
"ti_locked: %d", __func__, ti_locked)); |
| 1506 |
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); |
| 1507 |
} |
| 1508 |
#endif |
| 1509 |
} |
| 1510 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 1511 |
KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", |
| 1512 |
__func__)); |
| 1513 |
KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", |
| 1514 |
__func__)); |
| 1515 |
|
| 1516 |
/* |
| 1517 |
* Segment received on connection. |
| 1518 |
* Reset idle time and keep-alive timer. |
| 1519 |
* XXX: This should be done after segment |
| 1520 |
* validation to ignore broken/spoofed segs. |
| 1521 |
*/ |
| 1522 |
tp->t_rcvtime = ticks; |
| 1523 |
if (TCPS_HAVEESTABLISHED(tp->t_state)) |
| 1524 |
tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); |
| 1525 |
|
| 1526 |
/* |
| 1527 |
* Unscale the window into a 32-bit value. |
| 1528 |
* For the SYN_SENT state the scale is zero. |
| 1529 |
*/ |
| 1530 |
tiwin = th->th_win << tp->snd_scale; |
| 1531 |
|
| 1532 |
/* |
| 1533 |
* TCP ECN processing. |
| 1534 |
*/ |
| 1535 |
if (tp->t_flags & TF_ECN_PERMIT) { |
| 1536 |
if (thflags & TH_CWR) |
| 1537 |
tp->t_flags &= ~TF_ECN_SND_ECE; |
| 1538 |
switch (iptos & IPTOS_ECN_MASK) { |
| 1539 |
case IPTOS_ECN_CE: |
| 1540 |
tp->t_flags |= TF_ECN_SND_ECE; |
| 1541 |
TCPSTAT_INC(tcps_ecn_ce); |
| 1542 |
break; |
| 1543 |
case IPTOS_ECN_ECT0: |
| 1544 |
TCPSTAT_INC(tcps_ecn_ect0); |
| 1545 |
break; |
| 1546 |
case IPTOS_ECN_ECT1: |
| 1547 |
TCPSTAT_INC(tcps_ecn_ect1); |
| 1548 |
break; |
| 1549 |
} |
| 1550 |
/* Congestion experienced. */ |
| 1551 |
if (thflags & TH_ECE) { |
| 1552 |
cc_cong_signal(tp, th, CC_ECN); |
| 1553 |
} |
| 1554 |
} |
| 1555 |
|
| 1556 |
/* |
| 1557 |
* Parse options on any incoming segment. |
| 1558 |
*/ |
| 1559 |
tcp_dooptions(&to, (u_char *)(th + 1), |
| 1560 |
(th->th_off << 2) - sizeof(struct tcphdr), |
| 1561 |
(thflags & TH_SYN) ? TO_SYN : 0); |
| 1562 |
|
| 1563 |
/* |
| 1564 |
* If echoed timestamp is later than the current time, |
| 1565 |
* fall back to non RFC1323 RTT calculation. Normalize |
| 1566 |
* timestamp if syncookies were used when this connection |
| 1567 |
* was established. |
| 1568 |
*/ |
| 1569 |
if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { |
| 1570 |
to.to_tsecr -= tp->ts_offset; |
| 1571 |
if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) |
| 1572 |
to.to_tsecr = 0; |
| 1573 |
} |
| 1574 |
|
| 1575 |
/* |
| 1576 |
* Process options only when we get SYN/ACK back. The SYN case |
| 1577 |
* for incoming connections is handled in tcp_syncache. |
| 1578 |
* According to RFC1323 the window field in a SYN (i.e., a <SYN> |
| 1579 |
* or <SYN,ACK>) segment itself is never scaled. |
| 1580 |
* XXX this is traditional behavior, may need to be cleaned up. |
| 1581 |
*/ |
| 1582 |
if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { |
| 1583 |
if ((to.to_flags & TOF_SCALE) && |
| 1584 |
(tp->t_flags & TF_REQ_SCALE)) { |
| 1585 |
tp->t_flags |= TF_RCVD_SCALE; |
| 1586 |
tp->snd_scale = to.to_wscale; |
| 1587 |
} |
| 1588 |
/* |
| 1589 |
* Initial send window. It will be updated with |
| 1590 |
* the next incoming segment to the scaled value. |
| 1591 |
*/ |
| 1592 |
tp->snd_wnd = th->th_win; |
| 1593 |
if (to.to_flags & TOF_TS) { |
| 1594 |
tp->t_flags |= TF_RCVD_TSTMP; |
| 1595 |
tp->ts_recent = to.to_tsval; |
| 1596 |
tp->ts_recent_age = tcp_ts_getticks(); |
| 1597 |
} |
| 1598 |
if (to.to_flags & TOF_MSS) |
| 1599 |
tcp_mss(tp, to.to_mss); |
| 1600 |
if ((tp->t_flags & TF_SACK_PERMIT) && |
| 1601 |
(to.to_flags & TOF_SACKPERM) == 0) |
| 1602 |
tp->t_flags &= ~TF_SACK_PERMIT; |
| 1603 |
} |
| 1604 |
|
| 1605 |
/* |
| 1606 |
* Header prediction: check for the two common cases |
| 1607 |
* of a uni-directional data xfer. If the packet has |
| 1608 |
* no control flags, is in-sequence, the window didn't |
| 1609 |
* change and we're not retransmitting, it's a |
| 1610 |
* candidate. If the length is zero and the ack moved |
| 1611 |
* forward, we're the sender side of the xfer. Just |
| 1612 |
* free the data acked & wake any higher level process |
| 1613 |
* that was blocked waiting for space. If the length |
| 1614 |
* is non-zero and the ack didn't move, we're the |
| 1615 |
* receiver side. If we're getting packets in-order |
| 1616 |
* (the reassembly queue is empty), add the data to |
| 1617 |
* the socket buffer and note that we need a delayed ack. |
| 1618 |
* Make sure that the hidden state-flags are also off. |
| 1619 |
* Since we check for TCPS_ESTABLISHED first, it can only |
| 1620 |
* be TH_NEEDSYN. |
| 1621 |
*/ |
| 1622 |
if (tp->t_state == TCPS_ESTABLISHED && |
| 1623 |
th->th_seq == tp->rcv_nxt && |
| 1624 |
(thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && |
| 1625 |
tp->snd_nxt == tp->snd_max && |
| 1626 |
tiwin && tiwin == tp->snd_wnd && |
| 1627 |
((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && |
| 1628 |
LIST_EMPTY(&tp->t_segq) && |
| 1629 |
((to.to_flags & TOF_TS) == 0 || |
| 1630 |
TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { |
| 1631 |
|
| 1632 |
/* |
| 1633 |
* If last ACK falls within this segment's sequence numbers, |
| 1634 |
* record the timestamp. |
| 1635 |
* NOTE that the test is modified according to the latest |
| 1636 |
* proposal of the tcplw@cray.com list (Braden 1993/04/26). |
| 1637 |
*/ |
| 1638 |
if ((to.to_flags & TOF_TS) != 0 && |
| 1639 |
SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { |
| 1640 |
tp->ts_recent_age = tcp_ts_getticks(); |
| 1641 |
tp->ts_recent = to.to_tsval; |
| 1642 |
} |
| 1643 |
|
| 1644 |
if (tlen == 0) { |
| 1645 |
if (SEQ_GT(th->th_ack, tp->snd_una) && |
| 1646 |
SEQ_LEQ(th->th_ack, tp->snd_max) && |
| 1647 |
!IN_RECOVERY(tp->t_flags) && |
| 1648 |
(to.to_flags & TOF_SACK) == 0 && |
| 1649 |
TAILQ_EMPTY(&tp->snd_holes)) { |
| 1650 |
/* |
| 1651 |
* This is a pure ack for outstanding data. |
| 1652 |
*/ |
| 1653 |
if (ti_locked == TI_WLOCKED) |
| 1654 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 1655 |
ti_locked = TI_UNLOCKED; |
| 1656 |
|
| 1657 |
TCPSTAT_INC(tcps_predack); |
| 1658 |
|
| 1659 |
/* |
| 1660 |
* "bad retransmit" recovery. |
| 1661 |
*/ |
| 1662 |
if (tp->t_rxtshift == 1 && |
| 1663 |
tp->t_flags & TF_PREVVALID && |
| 1664 |
(int)(ticks - tp->t_badrxtwin) < 0) { |
| 1665 |
cc_cong_signal(tp, th, CC_RTO_ERR); |
| 1666 |
} |
| 1667 |
|
| 1668 |
/* |
| 1669 |
* Recalculate the transmit timer / rtt. |
| 1670 |
* |
| 1671 |
* Some boxes send broken timestamp replies |
| 1672 |
* during the SYN+ACK phase, ignore |
| 1673 |
* timestamps of 0 or we could calculate a |
| 1674 |
* huge RTT and blow up the retransmit timer. |
| 1675 |
*/ |
| 1676 |
if ((to.to_flags & TOF_TS) != 0 && |
| 1677 |
to.to_tsecr) { |
| 1678 |
u_int t; |
| 1679 |
|
| 1680 |
t = tcp_ts_getticks() - to.to_tsecr; |
| 1681 |
if (!tp->t_rttlow || tp->t_rttlow > t) |
| 1682 |
tp->t_rttlow = t; |
| 1683 |
tcp_xmit_timer(tp, |
| 1684 |
TCP_TS_TO_TICKS(t) + 1); |
| 1685 |
} else if (tp->t_rtttime && |
| 1686 |
SEQ_GT(th->th_ack, tp->t_rtseq)) { |
| 1687 |
if (!tp->t_rttlow || |
| 1688 |
tp->t_rttlow > ticks - tp->t_rtttime) |
| 1689 |
tp->t_rttlow = ticks - tp->t_rtttime; |
| 1690 |
tcp_xmit_timer(tp, |
| 1691 |
ticks - tp->t_rtttime); |
| 1692 |
} |
| 1693 |
acked = BYTES_THIS_ACK(tp, th); |
| 1694 |
|
| 1695 |
/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ |
| 1696 |
hhook_run_tcp_est_in(tp, th, &to); |
| 1697 |
|
| 1698 |
TCPSTAT_INC(tcps_rcvackpack); |
| 1699 |
TCPSTAT_ADD(tcps_rcvackbyte, acked); |
| 1700 |
sbdrop(&so->so_snd, acked); |
| 1701 |
if (SEQ_GT(tp->snd_una, tp->snd_recover) && |
| 1702 |
SEQ_LEQ(th->th_ack, tp->snd_recover)) |
| 1703 |
tp->snd_recover = th->th_ack - 1; |
| 1704 |
|
| 1705 |
/* |
| 1706 |
* Let the congestion control algorithm update |
| 1707 |
* congestion control related information. This |
| 1708 |
* typically means increasing the congestion |
| 1709 |
* window. |
| 1710 |
*/ |
| 1711 |
cc_ack_received(tp, th, CC_ACK); |
| 1712 |
|
| 1713 |
tp->snd_una = th->th_ack; |
| 1714 |
/* |
| 1715 |
* Pull snd_wl2 up to prevent seq wrap relative |
| 1716 |
* to th_ack. |
| 1717 |
*/ |
| 1718 |
tp->snd_wl2 = th->th_ack; |
| 1719 |
tp->t_dupacks = 0; |
| 1720 |
m_freem(m); |
| 1721 |
ND6_HINT(tp); /* Some progress has been made. */ |
| 1722 |
|
| 1723 |
/* |
| 1724 |
* If all outstanding data are acked, stop |
| 1725 |
* retransmit timer, otherwise restart timer |
| 1726 |
* using current (possibly backed-off) value. |
| 1727 |
* If process is waiting for space, |
| 1728 |
* wakeup/selwakeup/signal. If data |
| 1729 |
* are ready to send, let tcp_output |
| 1730 |
* decide between more output or persist. |
| 1731 |
*/ |
| 1732 |
#ifdef TCPDEBUG |
| 1733 |
if (so->so_options & SO_DEBUG) |
| 1734 |
tcp_trace(TA_INPUT, ostate, tp, |
| 1735 |
(void *)tcp_saveipgen, |
| 1736 |
&tcp_savetcp, 0); |
| 1737 |
#endif |
| 1738 |
if (tp->snd_una == tp->snd_max) |
| 1739 |
tcp_timer_activate(tp, TT_REXMT, 0); |
| 1740 |
else if (!tcp_timer_active(tp, TT_PERSIST)) |
| 1741 |
tcp_timer_activate(tp, TT_REXMT, |
| 1742 |
tp->t_rxtcur); |
| 1743 |
sowwakeup(so); |
| 1744 |
if (so->so_snd.sb_cc) |
| 1745 |
(void) tcp_output(tp); |
| 1746 |
goto check_delack; |
| 1747 |
} |
| 1748 |
} else if (th->th_ack == tp->snd_una && |
| 1749 |
tlen <= sbspace(&so->so_rcv)) { |
| 1750 |
int newsize = 0; /* automatic sockbuf scaling */ |
| 1751 |
|
| 1752 |
/* |
| 1753 |
* This is a pure, in-sequence data packet with |
| 1754 |
* nothing on the reassembly queue and we have enough |
| 1755 |
* buffer space to take it. |
| 1756 |
*/ |
| 1757 |
if (ti_locked == TI_WLOCKED) |
| 1758 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 1759 |
ti_locked = TI_UNLOCKED; |
| 1760 |
|
| 1761 |
/* Clean receiver SACK report if present */ |
| 1762 |
if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) |
| 1763 |
tcp_clean_sackreport(tp); |
| 1764 |
TCPSTAT_INC(tcps_preddat); |
| 1765 |
tp->rcv_nxt += tlen; |
| 1766 |
/* |
| 1767 |
* Pull snd_wl1 up to prevent seq wrap relative to |
| 1768 |
* th_seq. |
| 1769 |
*/ |
| 1770 |
tp->snd_wl1 = th->th_seq; |
| 1771 |
/* |
| 1772 |
* Pull rcv_up up to prevent seq wrap relative to |
| 1773 |
* rcv_nxt. |
| 1774 |
*/ |
| 1775 |
tp->rcv_up = tp->rcv_nxt; |
| 1776 |
TCPSTAT_INC(tcps_rcvpack); |
| 1777 |
TCPSTAT_ADD(tcps_rcvbyte, tlen); |
| 1778 |
ND6_HINT(tp); /* Some progress has been made */ |
| 1779 |
#ifdef TCPDEBUG |
| 1780 |
if (so->so_options & SO_DEBUG) |
| 1781 |
tcp_trace(TA_INPUT, ostate, tp, |
| 1782 |
(void *)tcp_saveipgen, &tcp_savetcp, 0); |
| 1783 |
#endif |
| 1784 |
/* |
| 1785 |
* Automatic sizing of receive socket buffer. Often the send |
| 1786 |
* buffer size is not optimally adjusted to the actual network |
| 1787 |
* conditions at hand (delay bandwidth product). Setting the |
| 1788 |
* buffer size too small limits throughput on links with high |
| 1789 |
* bandwidth and high delay (eg. trans-continental/oceanic links). |
| 1790 |
* |
| 1791 |
* On the receive side the socket buffer memory is only rarely |
| 1792 |
* used to any significant extent. This allows us to be much |
| 1793 |
* more aggressive in scaling the receive socket buffer. For |
| 1794 |
* the case that the buffer space is actually used to a large |
| 1795 |
* extent and we run out of kernel memory we can simply drop |
| 1796 |
* the new segments; TCP on the sender will just retransmit it |
| 1797 |
* later. Setting the buffer size too big may only consume too |
| 1798 |
* much kernel memory if the application doesn't read() from |
| 1799 |
* the socket or packet loss or reordering makes use of the |
| 1800 |
* reassembly queue. |
| 1801 |
* |
| 1802 |
* The criteria to step up the receive buffer one notch are: |
| 1803 |
* 1. the number of bytes received during the time it takes |
| 1804 |
* one timestamp to be reflected back to us (the RTT); |
| 1805 |
* 2. received bytes per RTT is within seven eighth of the |
| 1806 |
* current socket buffer size; |
| 1807 |
* 3. receive buffer size has not hit maximal automatic size; |
| 1808 |
* |
| 1809 |
* This algorithm does one step per RTT at most and only if |
| 1810 |
* we receive a bulk stream w/o packet losses or reorderings. |
| 1811 |
* Shrinking the buffer during idle times is not necessary as |
| 1812 |
* it doesn't consume any memory when idle. |
| 1813 |
* |
| 1814 |
* TODO: Only step up if the application is actually serving |
| 1815 |
* the buffer to better manage the socket buffer resources. |
| 1816 |
*/ |
| 1817 |
if (V_tcp_do_autorcvbuf && |
| 1818 |
to.to_tsecr && |
| 1819 |
(so->so_rcv.sb_flags & SB_AUTOSIZE)) { |
| 1820 |
if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) && |
| 1821 |
to.to_tsecr - tp->rfbuf_ts < hz) { |
| 1822 |
if (tp->rfbuf_cnt > |
| 1823 |
(so->so_rcv.sb_hiwat / 8 * 7) && |
| 1824 |
so->so_rcv.sb_hiwat < |
| 1825 |
V_tcp_autorcvbuf_max) { |
| 1826 |
newsize = |
| 1827 |
min(so->so_rcv.sb_hiwat + |
| 1828 |
V_tcp_autorcvbuf_inc, |
| 1829 |
V_tcp_autorcvbuf_max); |
| 1830 |
} |
| 1831 |
/* Start over with next RTT. */ |
| 1832 |
tp->rfbuf_ts = 0; |
| 1833 |
tp->rfbuf_cnt = 0; |
| 1834 |
} else |
| 1835 |
tp->rfbuf_cnt += tlen; /* add up */ |
| 1836 |
} |
| 1837 |
|
| 1838 |
/* Add data to socket buffer. */ |
| 1839 |
SOCKBUF_LOCK(&so->so_rcv); |
| 1840 |
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { |
| 1841 |
m_freem(m); |
| 1842 |
} else { |
| 1843 |
/* |
| 1844 |
* Set new socket buffer size. |
| 1845 |
* Give up when limit is reached. |
| 1846 |
*/ |
| 1847 |
if (newsize) |
| 1848 |
if (!sbreserve_locked(&so->so_rcv, |
| 1849 |
newsize, so, NULL)) |
| 1850 |
so->so_rcv.sb_flags &= ~SB_AUTOSIZE; |
| 1851 |
m_adj(m, drop_hdrlen); /* delayed header drop */ |
| 1852 |
sbappendstream_locked(&so->so_rcv, m); |
| 1853 |
} |
| 1854 |
/* NB: sorwakeup_locked() does an implicit unlock. */ |
| 1855 |
sorwakeup_locked(so); |
| 1856 |
if (DELAY_ACK(tp, tlen)) { |
| 1857 |
tp->t_flags |= TF_DELACK; |
| 1858 |
} else { |
| 1859 |
tp->t_flags |= TF_ACKNOW; |
| 1860 |
tcp_output(tp); |
| 1861 |
} |
| 1862 |
goto check_delack; |
| 1863 |
} |
| 1864 |
} |
| 1865 |
|
| 1866 |
/* |
| 1867 |
* Calculate amount of space in receive window, |
| 1868 |
* and then do TCP input processing. |
| 1869 |
* Receive window is amount of space in rcv queue, |
| 1870 |
* but not less than advertised window. |
| 1871 |
*/ |
| 1872 |
win = sbspace(&so->so_rcv); |
| 1873 |
if (win < 0) |
| 1874 |
win = 0; |
| 1875 |
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); |
| 1876 |
|
| 1877 |
/* Reset receive buffer auto scaling when not in bulk receive mode. */ |
| 1878 |
tp->rfbuf_ts = 0; |
| 1879 |
tp->rfbuf_cnt = 0; |
| 1880 |
|
| 1881 |
switch (tp->t_state) { |
| 1882 |
|
| 1883 |
/* |
| 1884 |
* If the state is SYN_RECEIVED: |
| 1885 |
* if seg contains an ACK, but not for our SYN/ACK, send a RST. |
| 1886 |
*/ |
| 1887 |
case TCPS_SYN_RECEIVED: |
| 1888 |
if ((thflags & TH_ACK) && |
| 1889 |
(SEQ_LEQ(th->th_ack, tp->snd_una) || |
| 1890 |
SEQ_GT(th->th_ack, tp->snd_max))) { |
| 1891 |
rstreason = BANDLIM_RST_OPENPORT; |
| 1892 |
goto dropwithreset; |
| 1893 |
} |
| 1894 |
break; |
| 1895 |
|
| 1896 |
/* |
| 1897 |
* If the state is SYN_SENT: |
| 1898 |
* if seg contains an ACK, but not for our SYN, drop the input. |
| 1899 |
* if seg contains a RST, then drop the connection. |
| 1900 |
* if seg does not contain SYN, then drop it. |
| 1901 |
* Otherwise this is an acceptable SYN segment |
| 1902 |
* initialize tp->rcv_nxt and tp->irs |
| 1903 |
* if seg contains ack then advance tp->snd_una |
| 1904 |
* if seg contains an ECE and ECN support is enabled, the stream |
| 1905 |
* is ECN capable. |
| 1906 |
* if SYN has been acked change to ESTABLISHED else SYN_RCVD state |
| 1907 |
* arrange for segment to be acked (eventually) |
| 1908 |
* continue processing rest of data/controls, beginning with URG |
| 1909 |
*/ |
| 1910 |
case TCPS_SYN_SENT: |
| 1911 |
if ((thflags & TH_ACK) && |
| 1912 |
(SEQ_LEQ(th->th_ack, tp->iss) || |
| 1913 |
SEQ_GT(th->th_ack, tp->snd_max))) { |
| 1914 |
rstreason = BANDLIM_UNLIMITED; |
| 1915 |
goto dropwithreset; |
| 1916 |
} |
| 1917 |
if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) |
| 1918 |
tp = tcp_drop(tp, ECONNREFUSED); |
| 1919 |
if (thflags & TH_RST) |
| 1920 |
goto drop; |
| 1921 |
if (!(thflags & TH_SYN)) |
| 1922 |
goto drop; |
| 1923 |
|
| 1924 |
tp->irs = th->th_seq; |
| 1925 |
tcp_rcvseqinit(tp); |
| 1926 |
if (thflags & TH_ACK) { |
| 1927 |
TCPSTAT_INC(tcps_connects); |
| 1928 |
soisconnected(so); |
| 1929 |
#ifdef MAC |
| 1930 |
mac_socketpeer_set_from_mbuf(m, so); |
| 1931 |
#endif |
| 1932 |
/* Do window scaling on this connection? */ |
| 1933 |
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == |
| 1934 |
(TF_RCVD_SCALE|TF_REQ_SCALE)) { |
| 1935 |
tp->rcv_scale = tp->request_r_scale; |
| 1936 |
} |
| 1937 |
tp->rcv_adv += imin(tp->rcv_wnd, |
| 1938 |
TCP_MAXWIN << tp->rcv_scale); |
| 1939 |
tp->snd_una++; /* SYN is acked */ |
| 1940 |
/* |
| 1941 |
* If there's data, delay ACK; if there's also a FIN |
| 1942 |
* ACKNOW will be turned on later. |
| 1943 |
*/ |
| 1944 |
if (DELAY_ACK(tp, tlen) && tlen != 0) |
| 1945 |
tcp_timer_activate(tp, TT_DELACK, |
| 1946 |
tcp_delacktime); |
| 1947 |
else |
| 1948 |
tp->t_flags |= TF_ACKNOW; |
| 1949 |
|
| 1950 |
if ((thflags & TH_ECE) && V_tcp_do_ecn) { |
| 1951 |
tp->t_flags |= TF_ECN_PERMIT; |
| 1952 |
TCPSTAT_INC(tcps_ecn_shs); |
| 1953 |
} |
| 1954 |
|
| 1955 |
/* |
| 1956 |
* Received <SYN,ACK> in SYN_SENT[*] state. |
| 1957 |
* Transitions: |
| 1958 |
* SYN_SENT --> ESTABLISHED |
| 1959 |
* SYN_SENT* --> FIN_WAIT_1 |
| 1960 |
*/ |
| 1961 |
tp->t_starttime = ticks; |
| 1962 |
if (tp->t_flags & TF_NEEDFIN) { |
| 1963 |
tp->t_state = TCPS_FIN_WAIT_1; |
| 1964 |
tp->t_flags &= ~TF_NEEDFIN; |
| 1965 |
thflags &= ~TH_SYN; |
| 1966 |
} else { |
| 1967 |
tp->t_state = TCPS_ESTABLISHED; |
| 1968 |
cc_conn_init(tp); |
| 1969 |
tcp_timer_activate(tp, TT_KEEP, |
| 1970 |
TP_KEEPIDLE(tp)); |
| 1971 |
} |
| 1972 |
} else { |
| 1973 |
/* |
| 1974 |
* Received initial SYN in SYN-SENT[*] state => |
| 1975 |
* simultaneous open. If segment contains CC option |
| 1976 |
* and there is a cached CC, apply TAO test. |
| 1977 |
* If it succeeds, connection is half-synchronized. |
| 1978 |
* Otherwise, do 3-way handshake: |
| 1979 |
* SYN-SENT -> SYN-RECEIVED |
| 1980 |
* SYN-SENT* -> SYN-RECEIVED* |
| 1981 |
* If there was no CC option, clear cached CC value. |
| 1982 |
*/ |
| 1983 |
tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); |
| 1984 |
tcp_timer_activate(tp, TT_REXMT, 0); |
| 1985 |
tp->t_state = TCPS_SYN_RECEIVED; |
| 1986 |
} |
| 1987 |
|
| 1988 |
KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: " |
| 1989 |
"ti_locked %d", __func__, ti_locked)); |
| 1990 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 1991 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 1992 |
|
| 1993 |
/* |
| 1994 |
* Advance th->th_seq to correspond to first data byte. |
| 1995 |
* If data, trim to stay within window, |
| 1996 |
* dropping FIN if necessary. |
| 1997 |
*/ |
| 1998 |
th->th_seq++; |
| 1999 |
if (tlen > tp->rcv_wnd) { |
| 2000 |
todrop = tlen - tp->rcv_wnd; |
| 2001 |
m_adj(m, -todrop); |
| 2002 |
tlen = tp->rcv_wnd; |
| 2003 |
thflags &= ~TH_FIN; |
| 2004 |
TCPSTAT_INC(tcps_rcvpackafterwin); |
| 2005 |
TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); |
| 2006 |
} |
| 2007 |
tp->snd_wl1 = th->th_seq - 1; |
| 2008 |
tp->rcv_up = th->th_seq; |
| 2009 |
/* |
| 2010 |
* Client side of transaction: already sent SYN and data. |
| 2011 |
* If the remote host used T/TCP to validate the SYN, |
| 2012 |
* our data will be ACK'd; if so, enter normal data segment |
| 2013 |
* processing in the middle of step 5, ack processing. |
| 2014 |
* Otherwise, goto step 6. |
| 2015 |
*/ |
| 2016 |
if (thflags & TH_ACK) |
| 2017 |
goto process_ACK; |
| 2018 |
|
| 2019 |
goto step6; |
| 2020 |
|
| 2021 |
/* |
| 2022 |
* If the state is LAST_ACK or CLOSING or TIME_WAIT: |
| 2023 |
* do normal processing. |
| 2024 |
* |
| 2025 |
* NB: Leftover from RFC1644 T/TCP. Cases to be reused later. |
| 2026 |
*/ |
| 2027 |
case TCPS_LAST_ACK: |
| 2028 |
case TCPS_CLOSING: |
| 2029 |
break; /* continue normal processing */ |
| 2030 |
} |
| 2031 |
|
| 2032 |
/* |
| 2033 |
* States other than LISTEN or SYN_SENT. |
| 2034 |
* First check the RST flag and sequence number since reset segments |
| 2035 |
* are exempt from the timestamp and connection count tests. This |
| 2036 |
* fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix |
| 2037 |
* below which allowed reset segments in half the sequence space |
| 2038 |
* to fall through and be processed (which gives forged reset |
| 2039 |
* segments with a random sequence number a 50 percent chance of |
| 2040 |
* killing a connection). |
| 2041 |
* Then check timestamp, if present. |
| 2042 |
* Then check the connection count, if present. |
| 2043 |
* Then check that at least some bytes of segment are within |
| 2044 |
* receive window. If segment begins before rcv_nxt, |
| 2045 |
* drop leading data (and SYN); if nothing left, just ack. |
| 2046 |
* |
| 2047 |
* |
| 2048 |
* If the RST bit is set, check the sequence number to see |
| 2049 |
* if this is a valid reset segment. |
| 2050 |
* RFC 793 page 37: |
| 2051 |
* In all states except SYN-SENT, all reset (RST) segments |
| 2052 |
* are validated by checking their SEQ-fields. A reset is |
| 2053 |
* valid if its sequence number is in the window. |
| 2054 |
* Note: this does not take into account delayed ACKs, so |
| 2055 |
* we should test against last_ack_sent instead of rcv_nxt. |
| 2056 |
* The sequence number in the reset segment is normally an |
| 2057 |
* echo of our outgoing acknowledgement numbers, but some hosts |
| 2058 |
* send a reset with the sequence number at the rightmost edge |
| 2059 |
* of our receive window, and we have to handle this case. |
| 2060 |
* Note 2: Paul Watson's paper "Slipping in the Window" has shown |
| 2061 |
* that brute force RST attacks are possible. To combat this, |
| 2062 |
* we use a much stricter check while in the ESTABLISHED state, |
| 2063 |
* only accepting RSTs where the sequence number is equal to |
| 2064 |
* last_ack_sent. In all other states (the states in which a |
| 2065 |
* RST is more likely), the more permissive check is used. |
| 2066 |
* If we have multiple segments in flight, the initial reset |
| 2067 |
* segment sequence numbers will be to the left of last_ack_sent, |
| 2068 |
* but they will eventually catch up. |
| 2069 |
* In any case, it never made sense to trim reset segments to |
| 2070 |
* fit the receive window since RFC 1122 says: |
| 2071 |
* 4.2.2.12 RST Segment: RFC-793 Section 3.4 |
| 2072 |
* |
| 2073 |
* A TCP SHOULD allow a received RST segment to include data. |
| 2074 |
* |
| 2075 |
* DISCUSSION |
| 2076 |
* It has been suggested that a RST segment could contain |
| 2077 |
* ASCII text that encoded and explained the cause of the |
| 2078 |
* RST. No standard has yet been established for such |
| 2079 |
* data. |
| 2080 |
* |
| 2081 |
* If the reset segment passes the sequence number test examine |
| 2082 |
* the state: |
| 2083 |
* SYN_RECEIVED STATE: |
| 2084 |
* If passive open, return to LISTEN state. |
| 2085 |
* If active open, inform user that connection was refused. |
| 2086 |
* ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: |
| 2087 |
* Inform user that connection was reset, and close tcb. |
| 2088 |
* CLOSING, LAST_ACK STATES: |
| 2089 |
* Close the tcb. |
| 2090 |
* TIME_WAIT STATE: |
| 2091 |
* Drop the segment - see Stevens, vol. 2, p. 964 and |
| 2092 |
* RFC 1337. |
| 2093 |
*/ |
| 2094 |
if (thflags & TH_RST) { |
| 2095 |
if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && |
| 2096 |
SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { |
| 2097 |
switch (tp->t_state) { |
| 2098 |
|
| 2099 |
case TCPS_SYN_RECEIVED: |
| 2100 |
so->so_error = ECONNREFUSED; |
| 2101 |
goto close; |
| 2102 |
|
| 2103 |
case TCPS_ESTABLISHED: |
| 2104 |
if (V_tcp_insecure_rst == 0 && |
| 2105 |
!(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && |
| 2106 |
SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && |
| 2107 |
!(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && |
| 2108 |
SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { |
| 2109 |
TCPSTAT_INC(tcps_badrst); |
| 2110 |
goto drop; |
| 2111 |
} |
| 2112 |
/* FALLTHROUGH */ |
| 2113 |
case TCPS_FIN_WAIT_1: |
| 2114 |
case TCPS_FIN_WAIT_2: |
| 2115 |
case TCPS_CLOSE_WAIT: |
| 2116 |
so->so_error = ECONNRESET; |
| 2117 |
close: |
| 2118 |
KASSERT(ti_locked == TI_WLOCKED, |
| 2119 |
("tcp_do_segment: TH_RST 1 ti_locked %d", |
| 2120 |
ti_locked)); |
| 2121 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 2122 |
|
| 2123 |
tp->t_state = TCPS_CLOSED; |
| 2124 |
TCPSTAT_INC(tcps_drops); |
| 2125 |
tp = tcp_close(tp); |
| 2126 |
break; |
| 2127 |
|
| 2128 |
case TCPS_CLOSING: |
| 2129 |
case TCPS_LAST_ACK: |
| 2130 |
KASSERT(ti_locked == TI_WLOCKED, |
| 2131 |
("tcp_do_segment: TH_RST 2 ti_locked %d", |
| 2132 |
ti_locked)); |
| 2133 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 2134 |
|
| 2135 |
tp = tcp_close(tp); |
| 2136 |
break; |
| 2137 |
} |
| 2138 |
} |
| 2139 |
goto drop; |
| 2140 |
} |
| 2141 |
|
| 2142 |
/* |
| 2143 |
* RFC 1323 PAWS: If we have a timestamp reply on this segment |
| 2144 |
* and it's less than ts_recent, drop it. |
| 2145 |
*/ |
| 2146 |
if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && |
| 2147 |
TSTMP_LT(to.to_tsval, tp->ts_recent)) { |
| 2148 |
|
| 2149 |
/* Check to see if ts_recent is over 24 days old. */ |
| 2150 |
if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { |
| 2151 |
/* |
| 2152 |
* Invalidate ts_recent. If this segment updates |
| 2153 |
* ts_recent, the age will be reset later and ts_recent |
| 2154 |
* will get a valid value. If it does not, setting |
| 2155 |
* ts_recent to zero will at least satisfy the |
| 2156 |
* requirement that zero be placed in the timestamp |
| 2157 |
* echo reply when ts_recent isn't valid. The |
| 2158 |
* age isn't reset until we get a valid ts_recent |
| 2159 |
* because we don't want out-of-order segments to be |
| 2160 |
* dropped when ts_recent is old. |
| 2161 |
*/ |
| 2162 |
tp->ts_recent = 0; |
| 2163 |
} else { |
| 2164 |
TCPSTAT_INC(tcps_rcvduppack); |
| 2165 |
TCPSTAT_ADD(tcps_rcvdupbyte, tlen); |
| 2166 |
TCPSTAT_INC(tcps_pawsdrop); |
| 2167 |
if (tlen) |
| 2168 |
goto dropafterack; |
| 2169 |
goto drop; |
| 2170 |
} |
| 2171 |
} |
| 2172 |
|
| 2173 |
/* |
| 2174 |
* In the SYN-RECEIVED state, validate that the packet belongs to |
| 2175 |
* this connection before trimming the data to fit the receive |
| 2176 |
* window. Check the sequence number versus IRS since we know |
| 2177 |
* the sequence numbers haven't wrapped. This is a partial fix |
| 2178 |
* for the "LAND" DoS attack. |
| 2179 |
*/ |
| 2180 |
if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { |
| 2181 |
rstreason = BANDLIM_RST_OPENPORT; |
| 2182 |
goto dropwithreset; |
| 2183 |
} |
| 2184 |
|
| 2185 |
todrop = tp->rcv_nxt - th->th_seq; |
| 2186 |
if (todrop > 0) { |
| 2187 |
if (thflags & TH_SYN) { |
| 2188 |
thflags &= ~TH_SYN; |
| 2189 |
th->th_seq++; |
| 2190 |
if (th->th_urp > 1) |
| 2191 |
th->th_urp--; |
| 2192 |
else |
| 2193 |
thflags &= ~TH_URG; |
| 2194 |
todrop--; |
| 2195 |
} |
| 2196 |
/* |
| 2197 |
* Following if statement from Stevens, vol. 2, p. 960. |
| 2198 |
*/ |
| 2199 |
if (todrop > tlen |
| 2200 |
|| (todrop == tlen && (thflags & TH_FIN) == 0)) { |
| 2201 |
/* |
| 2202 |
* Any valid FIN must be to the left of the window. |
| 2203 |
* At this point the FIN must be a duplicate or out |
| 2204 |
* of sequence; drop it. |
| 2205 |
*/ |
| 2206 |
thflags &= ~TH_FIN; |
| 2207 |
|
| 2208 |
/* |
| 2209 |
* Send an ACK to resynchronize and drop any data. |
| 2210 |
* But keep on processing for RST or ACK. |
| 2211 |
*/ |
| 2212 |
tp->t_flags |= TF_ACKNOW; |
| 2213 |
todrop = tlen; |
| 2214 |
TCPSTAT_INC(tcps_rcvduppack); |
| 2215 |
TCPSTAT_ADD(tcps_rcvdupbyte, todrop); |
| 2216 |
} else { |
| 2217 |
TCPSTAT_INC(tcps_rcvpartduppack); |
| 2218 |
TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); |
| 2219 |
} |
| 2220 |
drop_hdrlen += todrop; /* drop from the top afterwards */ |
| 2221 |
th->th_seq += todrop; |
| 2222 |
tlen -= todrop; |
| 2223 |
if (th->th_urp > todrop) |
| 2224 |
th->th_urp -= todrop; |
| 2225 |
else { |
| 2226 |
thflags &= ~TH_URG; |
| 2227 |
th->th_urp = 0; |
| 2228 |
} |
| 2229 |
} |
| 2230 |
|
| 2231 |
/* |
| 2232 |
* If new data are received on a connection after the |
| 2233 |
* user processes are gone, then RST the other end. |
| 2234 |
*/ |
| 2235 |
if ((so->so_state & SS_NOFDREF) && |
| 2236 |
tp->t_state > TCPS_CLOSE_WAIT && tlen) { |
| 2237 |
char *s; |
| 2238 |
|
| 2239 |
KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " |
| 2240 |
"CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); |
| 2241 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 2242 |
|
| 2243 |
if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { |
| 2244 |
log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " |
| 2245 |
"was closed, sending RST and removing tcpcb\n", |
| 2246 |
s, __func__, tcpstates[tp->t_state], tlen); |
| 2247 |
free(s, M_TCPLOG); |
| 2248 |
} |
| 2249 |
tp = tcp_close(tp); |
| 2250 |
TCPSTAT_INC(tcps_rcvafterclose); |
| 2251 |
rstreason = BANDLIM_UNLIMITED; |
| 2252 |
goto dropwithreset; |
| 2253 |
} |
| 2254 |
|
| 2255 |
/* |
| 2256 |
* If segment ends after window, drop trailing data |
| 2257 |
* (and PUSH and FIN); if nothing left, just ACK. |
| 2258 |
*/ |
| 2259 |
todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); |
| 2260 |
if (todrop > 0) { |
| 2261 |
TCPSTAT_INC(tcps_rcvpackafterwin); |
| 2262 |
if (todrop >= tlen) { |
| 2263 |
TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); |
| 2264 |
/* |
| 2265 |
* If window is closed can only take segments at |
| 2266 |
* window edge, and have to drop data and PUSH from |
| 2267 |
* incoming segments. Continue processing, but |
| 2268 |
* remember to ack. Otherwise, drop segment |
| 2269 |
* and ack. |
| 2270 |
*/ |
| 2271 |
if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { |
| 2272 |
tp->t_flags |= TF_ACKNOW; |
| 2273 |
TCPSTAT_INC(tcps_rcvwinprobe); |
| 2274 |
} else |
| 2275 |
goto dropafterack; |
| 2276 |
} else |
| 2277 |
TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); |
| 2278 |
m_adj(m, -todrop); |
| 2279 |
tlen -= todrop; |
| 2280 |
thflags &= ~(TH_PUSH|TH_FIN); |
| 2281 |
} |
| 2282 |
|
| 2283 |
/* |
| 2284 |
* If last ACK falls within this segment's sequence numbers, |
| 2285 |
* record its timestamp. |
| 2286 |
* NOTE: |
| 2287 |
* 1) That the test incorporates suggestions from the latest |
| 2288 |
* proposal of the tcplw@cray.com list (Braden 1993/04/26). |
| 2289 |
* 2) That updating only on newer timestamps interferes with |
| 2290 |
* our earlier PAWS tests, so this check should be solely |
| 2291 |
* predicated on the sequence space of this segment. |
| 2292 |
* 3) That we modify the segment boundary check to be |
| 2293 |
* Last.ACK.Sent <= SEG.SEQ + SEG.Len |
| 2294 |
* instead of RFC1323's |
| 2295 |
* Last.ACK.Sent < SEG.SEQ + SEG.Len, |
| 2296 |
* This modified check allows us to overcome RFC1323's |
| 2297 |
* limitations as described in Stevens TCP/IP Illustrated |
| 2298 |
* Vol. 2 p.869. In such cases, we can still calculate the |
| 2299 |
* RTT correctly when RCV.NXT == Last.ACK.Sent. |
| 2300 |
*/ |
| 2301 |
if ((to.to_flags & TOF_TS) != 0 && |
| 2302 |
SEQ_LEQ(th->th_seq, tp->last_ack_sent) && |
| 2303 |
SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + |
| 2304 |
((thflags & (TH_SYN|TH_FIN)) != 0))) { |
| 2305 |
tp->ts_recent_age = tcp_ts_getticks(); |
| 2306 |
tp->ts_recent = to.to_tsval; |
| 2307 |
} |
| 2308 |
|
| 2309 |
/* |
| 2310 |
* If a SYN is in the window, then this is an |
| 2311 |
* error and we send an RST and drop the connection. |
| 2312 |
*/ |
| 2313 |
if (thflags & TH_SYN) { |
| 2314 |
KASSERT(ti_locked == TI_WLOCKED, |
| 2315 |
("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); |
| 2316 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 2317 |
|
| 2318 |
tp = tcp_drop(tp, ECONNRESET); |
| 2319 |
rstreason = BANDLIM_UNLIMITED; |
| 2320 |
goto drop; |
| 2321 |
} |
| 2322 |
|
| 2323 |
/* |
| 2324 |
* If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN |
| 2325 |
* flag is on (half-synchronized state), then queue data for |
| 2326 |
* later processing; else drop segment and return. |
| 2327 |
*/ |
| 2328 |
if ((thflags & TH_ACK) == 0) { |
| 2329 |
if (tp->t_state == TCPS_SYN_RECEIVED || |
| 2330 |
(tp->t_flags & TF_NEEDSYN)) |
| 2331 |
goto step6; |
| 2332 |
else if (tp->t_flags & TF_ACKNOW) |
| 2333 |
goto dropafterack; |
| 2334 |
else |
| 2335 |
goto drop; |
| 2336 |
} |
| 2337 |
|
| 2338 |
/* |
| 2339 |
* Ack processing. |
| 2340 |
*/ |
| 2341 |
switch (tp->t_state) { |
| 2342 |
|
| 2343 |
/* |
| 2344 |
* In SYN_RECEIVED state, the ack ACKs our SYN, so enter |
| 2345 |
* ESTABLISHED state and continue processing. |
| 2346 |
* The ACK was checked above. |
| 2347 |
*/ |
| 2348 |
case TCPS_SYN_RECEIVED: |
| 2349 |
|
| 2350 |
TCPSTAT_INC(tcps_connects); |
| 2351 |
soisconnected(so); |
| 2352 |
/* Do window scaling? */ |
| 2353 |
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == |
| 2354 |
(TF_RCVD_SCALE|TF_REQ_SCALE)) { |
| 2355 |
tp->rcv_scale = tp->request_r_scale; |
| 2356 |
tp->snd_wnd = tiwin; |
| 2357 |
} |
| 2358 |
/* |
| 2359 |
* Make transitions: |
| 2360 |
* SYN-RECEIVED -> ESTABLISHED |
| 2361 |
* SYN-RECEIVED* -> FIN-WAIT-1 |
| 2362 |
*/ |
| 2363 |
tp->t_starttime = ticks; |
| 2364 |
if (tp->t_flags & TF_NEEDFIN) { |
| 2365 |
tp->t_state = TCPS_FIN_WAIT_1; |
| 2366 |
tp->t_flags &= ~TF_NEEDFIN; |
| 2367 |
} else { |
| 2368 |
tp->t_state = TCPS_ESTABLISHED; |
| 2369 |
cc_conn_init(tp); |
| 2370 |
tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); |
| 2371 |
} |
| 2372 |
/* |
| 2373 |
* If segment contains data or ACK, will call tcp_reass() |
| 2374 |
* later; if not, do so now to pass queued data to user. |
| 2375 |
*/ |
| 2376 |
if (tlen == 0 && (thflags & TH_FIN) == 0) |
| 2377 |
(void) tcp_reass(tp, (struct tcphdr *)0, 0, |
| 2378 |
(struct mbuf *)0); |
| 2379 |
tp->snd_wl1 = th->th_seq - 1; |
| 2380 |
/* FALLTHROUGH */ |
| 2381 |
|
| 2382 |
/* |
| 2383 |
* In ESTABLISHED state: drop duplicate ACKs; ACK out of range |
| 2384 |
* ACKs. If the ack is in the range |
| 2385 |
* tp->snd_una < th->th_ack <= tp->snd_max |
| 2386 |
* then advance tp->snd_una to th->th_ack and drop |
| 2387 |
* data from the retransmission queue. If this ACK reflects |
| 2388 |
* more up to date window information we update our window information. |
| 2389 |
*/ |
| 2390 |
case TCPS_ESTABLISHED: |
| 2391 |
case TCPS_FIN_WAIT_1: |
| 2392 |
case TCPS_FIN_WAIT_2: |
| 2393 |
case TCPS_CLOSE_WAIT: |
| 2394 |
case TCPS_CLOSING: |
| 2395 |
case TCPS_LAST_ACK: |
| 2396 |
if (SEQ_GT(th->th_ack, tp->snd_max)) { |
| 2397 |
TCPSTAT_INC(tcps_rcvacktoomuch); |
| 2398 |
goto dropafterack; |
| 2399 |
} |
| 2400 |
if ((tp->t_flags & TF_SACK_PERMIT) && |
| 2401 |
((to.to_flags & TOF_SACK) || |
| 2402 |
!TAILQ_EMPTY(&tp->snd_holes))) |
| 2403 |
tcp_sack_doack(tp, &to, th->th_ack); |
| 2404 |
|
| 2405 |
/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ |
| 2406 |
hhook_run_tcp_est_in(tp, th, &to); |
| 2407 |
|
| 2408 |
if (SEQ_LEQ(th->th_ack, tp->snd_una)) { |
| 2409 |
if (tlen == 0 && tiwin == tp->snd_wnd) { |
| 2410 |
TCPSTAT_INC(tcps_rcvdupack); |
| 2411 |
/* |
| 2412 |
* If we have outstanding data (other than |
| 2413 |
* a window probe), this is a completely |
| 2414 |
* duplicate ack (ie, window info didn't |
| 2415 |
* change), the ack is the biggest we've |
| 2416 |
* seen and we've seen exactly our rexmt |
| 2417 |
* threshold of them, assume a packet |
| 2418 |
* has been dropped and retransmit it. |
| 2419 |
* Kludge snd_nxt & the congestion |
| 2420 |
* window so we send only this one |
| 2421 |
* packet. |
| 2422 |
* |
| 2423 |
* We know we're losing at the current |
| 2424 |
* window size so do congestion avoidance |
| 2425 |
* (set ssthresh to half the current window |
| 2426 |
* and pull our congestion window back to |
| 2427 |
* the new ssthresh). |
| 2428 |
* |
| 2429 |
* Dup acks mean that packets have left the |
| 2430 |
* network (they're now cached at the receiver) |
| 2431 |
* so bump cwnd by the amount in the receiver |
| 2432 |
* to keep a constant cwnd packets in the |
| 2433 |
* network. |
| 2434 |
* |
| 2435 |
* When using TCP ECN, notify the peer that |
| 2436 |
* we reduced the cwnd. |
| 2437 |
*/ |
| 2438 |
if (!tcp_timer_active(tp, TT_REXMT) || |
| 2439 |
th->th_ack != tp->snd_una) |
| 2440 |
tp->t_dupacks = 0; |
| 2441 |
else if (++tp->t_dupacks > tcprexmtthresh || |
| 2442 |
IN_FASTRECOVERY(tp->t_flags)) { |
| 2443 |
cc_ack_received(tp, th, CC_DUPACK); |
| 2444 |
if ((tp->t_flags & TF_SACK_PERMIT) && |
| 2445 |
IN_FASTRECOVERY(tp->t_flags)) { |
| 2446 |
int awnd; |
| 2447 |
|
| 2448 |
/* |
| 2449 |
* Compute the amount of data in flight first. |
| 2450 |
* We can inject new data into the pipe iff |
| 2451 |
* we have less than 1/2 the original window's |
| 2452 |
* worth of data in flight. |
| 2453 |
*/ |
| 2454 |
awnd = (tp->snd_nxt - tp->snd_fack) + |
| 2455 |
tp->sackhint.sack_bytes_rexmit; |
| 2456 |
if (awnd < tp->snd_ssthresh) { |
| 2457 |
tp->snd_cwnd += tp->t_maxseg; |
| 2458 |
if (tp->snd_cwnd > tp->snd_ssthresh) |
| 2459 |
tp->snd_cwnd = tp->snd_ssthresh; |
| 2460 |
} |
| 2461 |
} else |
| 2462 |
tp->snd_cwnd += tp->t_maxseg; |
| 2463 |
if ((thflags & TH_FIN) && |
| 2464 |
(TCPS_HAVERCVDFIN(tp->t_state) == 0)) { |
| 2465 |
/* |
| 2466 |
* If it's a FIN we need to process |
| 2467 |
* it to avoid a race where both |
| 2468 |
* sides enter FIN-WAIT and send FIN|ACK |
| 2469 |
* at the same time. |
| 2470 |
*/ |
| 2471 |
break; |
| 2472 |
} |
| 2473 |
(void) tcp_output(tp); |
| 2474 |
goto drop; |
| 2475 |
} else if (tp->t_dupacks == tcprexmtthresh) { |
| 2476 |
tcp_seq onxt = tp->snd_nxt; |
| 2477 |
|
| 2478 |
/* |
| 2479 |
* If we're doing sack, check to |
| 2480 |
* see if we're already in sack |
| 2481 |
* recovery. If we're not doing sack, |
| 2482 |
* check to see if we're in newreno |
| 2483 |
* recovery. |
| 2484 |
*/ |
| 2485 |
if (tp->t_flags & TF_SACK_PERMIT) { |
| 2486 |
if (IN_FASTRECOVERY(tp->t_flags)) { |
| 2487 |
tp->t_dupacks = 0; |
| 2488 |
break; |
| 2489 |
} |
| 2490 |
} else { |
| 2491 |
if (SEQ_LEQ(th->th_ack, |
| 2492 |
tp->snd_recover)) { |
| 2493 |
tp->t_dupacks = 0; |
| 2494 |
break; |
| 2495 |
} |
| 2496 |
} |
| 2497 |
/* Congestion signal before ack. */ |
| 2498 |
cc_cong_signal(tp, th, CC_NDUPACK); |
| 2499 |
cc_ack_received(tp, th, CC_DUPACK); |
| 2500 |
tcp_timer_activate(tp, TT_REXMT, 0); |
| 2501 |
tp->t_rtttime = 0; |
| 2502 |
if (tp->t_flags & TF_SACK_PERMIT) { |
| 2503 |
TCPSTAT_INC( |
| 2504 |
tcps_sack_recovery_episode); |
| 2505 |
tp->sack_newdata = tp->snd_nxt; |
| 2506 |
tp->snd_cwnd = tp->t_maxseg; |
| 2507 |
(void) tcp_output(tp); |
| 2508 |
goto drop; |
| 2509 |
} |
| 2510 |
tp->snd_nxt = th->th_ack; |
| 2511 |
tp->snd_cwnd = tp->t_maxseg; |
| 2512 |
if ((thflags & TH_FIN) && |
| 2513 |
(TCPS_HAVERCVDFIN(tp->t_state) == 0)) { |
| 2514 |
/* |
| 2515 |
* If it's a FIN we need to process |
| 2516 |
* it to avoid a race where both |
| 2517 |
* sides enter FIN-WAIT and send FIN|ACK |
| 2518 |
* at the same time. |
| 2519 |
*/ |
| 2520 |
break; |
| 2521 |
} |
| 2522 |
(void) tcp_output(tp); |
| 2523 |
KASSERT(tp->snd_limited <= 2, |
| 2524 |
("%s: tp->snd_limited too big", |
| 2525 |
__func__)); |
| 2526 |
tp->snd_cwnd = tp->snd_ssthresh + |
| 2527 |
tp->t_maxseg * |
| 2528 |
(tp->t_dupacks - tp->snd_limited); |
| 2529 |
if (SEQ_GT(onxt, tp->snd_nxt)) |
| 2530 |
tp->snd_nxt = onxt; |
| 2531 |
goto drop; |
| 2532 |
} else if (V_tcp_do_rfc3042) { |
| 2533 |
cc_ack_received(tp, th, CC_DUPACK); |
| 2534 |
u_long oldcwnd = tp->snd_cwnd; |
| 2535 |
tcp_seq oldsndmax = tp->snd_max; |
| 2536 |
u_int sent; |
| 2537 |
int avail; |
| 2538 |
|
| 2539 |
KASSERT(tp->t_dupacks == 1 || |
| 2540 |
tp->t_dupacks == 2, |
| 2541 |
("%s: dupacks not 1 or 2", |
| 2542 |
__func__)); |
| 2543 |
if (tp->t_dupacks == 1) |
| 2544 |
tp->snd_limited = 0; |
| 2545 |
tp->snd_cwnd = |
| 2546 |
(tp->snd_nxt - tp->snd_una) + |
| 2547 |
(tp->t_dupacks - tp->snd_limited) * |
| 2548 |
tp->t_maxseg; |
| 2549 |
if ((thflags & TH_FIN) && |
| 2550 |
(TCPS_HAVERCVDFIN(tp->t_state) == 0)) { |
| 2551 |
/* |
| 2552 |
* If it's a FIN we need to process |
| 2553 |
* it to avoid a race where both |
| 2554 |
* sides enter FIN-WAIT and send FIN|ACK |
| 2555 |
* at the same time. |
| 2556 |
*/ |
| 2557 |
break; |
| 2558 |
} |
| 2559 |
/* |
| 2560 |
* Only call tcp_output when there |
| 2561 |
* is new data available to be sent. |
| 2562 |
* Otherwise we would send pure ACKs. |
| 2563 |
*/ |
| 2564 |
SOCKBUF_LOCK(&so->so_snd); |
| 2565 |
avail = so->so_snd.sb_cc - |
| 2566 |
(tp->snd_nxt - tp->snd_una); |
| 2567 |
SOCKBUF_UNLOCK(&so->so_snd); |
| 2568 |
if (avail > 0) |
| 2569 |
(void) tcp_output(tp); |
| 2570 |
sent = tp->snd_max - oldsndmax; |
| 2571 |
if (sent > tp->t_maxseg) { |
| 2572 |
KASSERT((tp->t_dupacks == 2 && |
| 2573 |
tp->snd_limited == 0) || |
| 2574 |
(sent == tp->t_maxseg + 1 && |
| 2575 |
tp->t_flags & TF_SENTFIN), |
| 2576 |
("%s: sent too much", |
| 2577 |
__func__)); |
| 2578 |
tp->snd_limited = 2; |
| 2579 |
} else if (sent > 0) |
| 2580 |
++tp->snd_limited; |
| 2581 |
tp->snd_cwnd = oldcwnd; |
| 2582 |
goto drop; |
| 2583 |
} |
| 2584 |
} else |
| 2585 |
tp->t_dupacks = 0; |
| 2586 |
break; |
| 2587 |
} |
| 2588 |
|
| 2589 |
KASSERT(SEQ_GT(th->th_ack, tp->snd_una), |
| 2590 |
("%s: th_ack <= snd_una", __func__)); |
| 2591 |
|
| 2592 |
/* |
| 2593 |
* If the congestion window was inflated to account |
| 2594 |
* for the other side's cached packets, retract it. |
| 2595 |
*/ |
| 2596 |
if (IN_FASTRECOVERY(tp->t_flags)) { |
| 2597 |
if (SEQ_LT(th->th_ack, tp->snd_recover)) { |
| 2598 |
if (tp->t_flags & TF_SACK_PERMIT) |
| 2599 |
tcp_sack_partialack(tp, th); |
| 2600 |
else |
| 2601 |
tcp_newreno_partial_ack(tp, th); |
| 2602 |
} else |
| 2603 |
cc_post_recovery(tp, th); |
| 2604 |
} |
| 2605 |
tp->t_dupacks = 0; |
| 2606 |
/* |
| 2607 |
* If we reach this point, ACK is not a duplicate, |
| 2608 |
* i.e., it ACKs something we sent. |
| 2609 |
*/ |
| 2610 |
if (tp->t_flags & TF_NEEDSYN) { |
| 2611 |
/* |
| 2612 |
* T/TCP: Connection was half-synchronized, and our |
| 2613 |
* SYN has been ACK'd (so connection is now fully |
| 2614 |
* synchronized). Go to non-starred state, |
| 2615 |
* increment snd_una for ACK of SYN, and check if |
| 2616 |
* we can do window scaling. |
| 2617 |
*/ |
| 2618 |
tp->t_flags &= ~TF_NEEDSYN; |
| 2619 |
tp->snd_una++; |
| 2620 |
/* Do window scaling? */ |
| 2621 |
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == |
| 2622 |
(TF_RCVD_SCALE|TF_REQ_SCALE)) { |
| 2623 |
tp->rcv_scale = tp->request_r_scale; |
| 2624 |
/* Send window already scaled. */ |
| 2625 |
} |
| 2626 |
} |
| 2627 |
|
| 2628 |
process_ACK: |
| 2629 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 2630 |
|
| 2631 |
acked = BYTES_THIS_ACK(tp, th); |
| 2632 |
KASSERT(acked >= 0, ("%s: acked unexepectedly negative " |
| 2633 |
"(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__, |
| 2634 |
tp->snd_una, th->th_ack, tp, m)); |
| 2635 |
TCPSTAT_INC(tcps_rcvackpack); |
| 2636 |
TCPSTAT_ADD(tcps_rcvackbyte, acked); |
| 2637 |
|
| 2638 |
/* |
| 2639 |
* If we just performed our first retransmit, and the ACK |
| 2640 |
* arrives within our recovery window, then it was a mistake |
| 2641 |
* to do the retransmit in the first place. Recover our |
| 2642 |
* original cwnd and ssthresh, and proceed to transmit where |
| 2643 |
* we left off. |
| 2644 |
*/ |
| 2645 |
if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && |
| 2646 |
(int)(ticks - tp->t_badrxtwin) < 0) |
| 2647 |
cc_cong_signal(tp, th, CC_RTO_ERR); |
| 2648 |
|
| 2649 |
/* |
| 2650 |
* If we have a timestamp reply, update smoothed |
| 2651 |
* round trip time. If no timestamp is present but |
| 2652 |
* transmit timer is running and timed sequence |
| 2653 |
* number was acked, update smoothed round trip time. |
| 2654 |
* Since we now have an rtt measurement, cancel the |
| 2655 |
* timer backoff (cf., Phil Karn's retransmit alg.). |
| 2656 |
* Recompute the initial retransmit timer. |
| 2657 |
* |
| 2658 |
* Some boxes send broken timestamp replies |
| 2659 |
* during the SYN+ACK phase, ignore |
| 2660 |
* timestamps of 0 or we could calculate a |
| 2661 |
* huge RTT and blow up the retransmit timer. |
| 2662 |
*/ |
| 2663 |
if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { |
| 2664 |
u_int t; |
| 2665 |
|
| 2666 |
t = tcp_ts_getticks() - to.to_tsecr; |
| 2667 |
if (!tp->t_rttlow || tp->t_rttlow > t) |
| 2668 |
tp->t_rttlow = t; |
| 2669 |
tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); |
| 2670 |
} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { |
| 2671 |
if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) |
| 2672 |
tp->t_rttlow = ticks - tp->t_rtttime; |
| 2673 |
tcp_xmit_timer(tp, ticks - tp->t_rtttime); |
| 2674 |
} |
| 2675 |
|
| 2676 |
/* |
| 2677 |
* If all outstanding data is acked, stop retransmit |
| 2678 |
* timer and remember to restart (more output or persist). |
| 2679 |
* If there is more data to be acked, restart retransmit |
| 2680 |
* timer, using current (possibly backed-off) value. |
| 2681 |
*/ |
| 2682 |
if (th->th_ack == tp->snd_max) { |
| 2683 |
tcp_timer_activate(tp, TT_REXMT, 0); |
| 2684 |
needoutput = 1; |
| 2685 |
} else if (!tcp_timer_active(tp, TT_PERSIST)) |
| 2686 |
tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); |
| 2687 |
|
| 2688 |
/* |
| 2689 |
* If no data (only SYN) was ACK'd, |
| 2690 |
* skip rest of ACK processing. |
| 2691 |
*/ |
| 2692 |
if (acked == 0) |
| 2693 |
goto step6; |
| 2694 |
|
| 2695 |
/* |
| 2696 |
* Let the congestion control algorithm update congestion |
| 2697 |
* control related information. This typically means increasing |
| 2698 |
* the congestion window. |
| 2699 |
*/ |
| 2700 |
cc_ack_received(tp, th, CC_ACK); |
| 2701 |
|
| 2702 |
SOCKBUF_LOCK(&so->so_snd); |
| 2703 |
if (acked > so->so_snd.sb_cc) { |
| 2704 |
if (tp->snd_wnd >= so->so_snd.sb_cc) |
| 2705 |
tp->snd_wnd -= so->so_snd.sb_cc; |
| 2706 |
else |
| 2707 |
tp->snd_wnd = 0; |
| 2708 |
sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); |
| 2709 |
ourfinisacked = 1; |
| 2710 |
} else { |
| 2711 |
sbdrop_locked(&so->so_snd, acked); |
| 2712 |
if (tp->snd_wnd >= (u_long) acked) |
| 2713 |
tp->snd_wnd -= acked; |
| 2714 |
else |
| 2715 |
tp->snd_wnd = 0; |
| 2716 |
ourfinisacked = 0; |
| 2717 |
} |
| 2718 |
/* NB: sowwakeup_locked() does an implicit unlock. */ |
| 2719 |
sowwakeup_locked(so); |
| 2720 |
/* Detect una wraparound. */ |
| 2721 |
if (!IN_RECOVERY(tp->t_flags) && |
| 2722 |
SEQ_GT(tp->snd_una, tp->snd_recover) && |
| 2723 |
SEQ_LEQ(th->th_ack, tp->snd_recover)) |
| 2724 |
tp->snd_recover = th->th_ack - 1; |
| 2725 |
/* XXXLAS: Can this be moved up into cc_post_recovery? */ |
| 2726 |
if (IN_RECOVERY(tp->t_flags) && |
| 2727 |
SEQ_GEQ(th->th_ack, tp->snd_recover)) { |
| 2728 |
EXIT_RECOVERY(tp->t_flags); |
| 2729 |
} |
| 2730 |
tp->snd_una = th->th_ack; |
| 2731 |
if (tp->t_flags & TF_SACK_PERMIT) { |
| 2732 |
if (SEQ_GT(tp->snd_una, tp->snd_recover)) |
| 2733 |
tp->snd_recover = tp->snd_una; |
| 2734 |
} |
| 2735 |
if (SEQ_LT(tp->snd_nxt, tp->snd_una)) |
| 2736 |
tp->snd_nxt = tp->snd_una; |
| 2737 |
|
| 2738 |
switch (tp->t_state) { |
| 2739 |
|
| 2740 |
/* |
| 2741 |
* In FIN_WAIT_1 STATE in addition to the processing |
| 2742 |
* for the ESTABLISHED state if our FIN is now acknowledged |
| 2743 |
* then enter FIN_WAIT_2. |
| 2744 |
*/ |
| 2745 |
case TCPS_FIN_WAIT_1: |
| 2746 |
if (ourfinisacked) { |
| 2747 |
/* |
| 2748 |
* If we can't receive any more |
| 2749 |
* data, then closing user can proceed. |
| 2750 |
* Starting the timer is contrary to the |
| 2751 |
* specification, but if we don't get a FIN |
| 2752 |
* we'll hang forever. |
| 2753 |
* |
| 2754 |
* XXXjl: |
| 2755 |
* we should release the tp also, and use a |
| 2756 |
* compressed state. |
| 2757 |
*/ |
| 2758 |
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { |
| 2759 |
soisdisconnected(so); |
| 2760 |
tcp_timer_activate(tp, TT_2MSL, |
| 2761 |
(tcp_fast_finwait2_recycle ? |
| 2762 |
tcp_finwait2_timeout : |
| 2763 |
TP_MAXIDLE(tp))); |
| 2764 |
} |
| 2765 |
tp->t_state = TCPS_FIN_WAIT_2; |
| 2766 |
} |
| 2767 |
break; |
| 2768 |
|
| 2769 |
/* |
| 2770 |
* In CLOSING STATE in addition to the processing for |
| 2771 |
* the ESTABLISHED state if the ACK acknowledges our FIN |
| 2772 |
* then enter the TIME-WAIT state, otherwise ignore |
| 2773 |
* the segment. |
| 2774 |
*/ |
| 2775 |
case TCPS_CLOSING: |
| 2776 |
if (ourfinisacked) { |
| 2777 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 2778 |
tcp_twstart(tp); |
| 2779 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 2780 |
m_freem(m); |
| 2781 |
return; |
| 2782 |
} |
| 2783 |
break; |
| 2784 |
|
| 2785 |
/* |
| 2786 |
* In LAST_ACK, we may still be waiting for data to drain |
| 2787 |
* and/or to be acked, as well as for the ack of our FIN. |
| 2788 |
* If our FIN is now acknowledged, delete the TCB, |
| 2789 |
* enter the closed state and return. |
| 2790 |
*/ |
| 2791 |
case TCPS_LAST_ACK: |
| 2792 |
if (ourfinisacked) { |
| 2793 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 2794 |
tp = tcp_close(tp); |
| 2795 |
goto drop; |
| 2796 |
} |
| 2797 |
break; |
| 2798 |
} |
| 2799 |
} |
| 2800 |
|
| 2801 |
step6: |
| 2802 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 2803 |
|
| 2804 |
/* |
| 2805 |
* Update window information. |
| 2806 |
* Don't look at window if no ACK: TAC's send garbage on first SYN. |
| 2807 |
*/ |
| 2808 |
if ((thflags & TH_ACK) && |
| 2809 |
(SEQ_LT(tp->snd_wl1, th->th_seq) || |
| 2810 |
(tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || |
| 2811 |
(tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { |
| 2812 |
/* keep track of pure window updates */ |
| 2813 |
if (tlen == 0 && |
| 2814 |
tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) |
| 2815 |
TCPSTAT_INC(tcps_rcvwinupd); |
| 2816 |
tp->snd_wnd = tiwin; |
| 2817 |
tp->snd_wl1 = th->th_seq; |
| 2818 |
tp->snd_wl2 = th->th_ack; |
| 2819 |
if (tp->snd_wnd > tp->max_sndwnd) |
| 2820 |
tp->max_sndwnd = tp->snd_wnd; |
| 2821 |
needoutput = 1; |
| 2822 |
} |
| 2823 |
|
| 2824 |
/* |
| 2825 |
* Process segments with URG. |
| 2826 |
*/ |
| 2827 |
if ((thflags & TH_URG) && th->th_urp && |
| 2828 |
TCPS_HAVERCVDFIN(tp->t_state) == 0) { |
| 2829 |
/* |
| 2830 |
* This is a kludge, but if we receive and accept |
| 2831 |
* random urgent pointers, we'll crash in |
| 2832 |
* soreceive. It's hard to imagine someone |
| 2833 |
* actually wanting to send this much urgent data. |
| 2834 |
*/ |
| 2835 |
SOCKBUF_LOCK(&so->so_rcv); |
| 2836 |
if (th->th_urp + so->so_rcv.sb_cc > sb_max) { |
| 2837 |
th->th_urp = 0; /* XXX */ |
| 2838 |
thflags &= ~TH_URG; /* XXX */ |
| 2839 |
SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ |
| 2840 |
goto dodata; /* XXX */ |
| 2841 |
} |
| 2842 |
/* |
| 2843 |
* If this segment advances the known urgent pointer, |
| 2844 |
* then mark the data stream. This should not happen |
| 2845 |
* in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since |
| 2846 |
* a FIN has been received from the remote side. |
| 2847 |
* In these states we ignore the URG. |
| 2848 |
* |
| 2849 |
* According to RFC961 (Assigned Protocols), |
| 2850 |
* the urgent pointer points to the last octet |
| 2851 |
* of urgent data. We continue, however, |
| 2852 |
* to consider it to indicate the first octet |
| 2853 |
* of data past the urgent section as the original |
| 2854 |
* spec states (in one of two places). |
| 2855 |
*/ |
| 2856 |
if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { |
| 2857 |
tp->rcv_up = th->th_seq + th->th_urp; |
| 2858 |
so->so_oobmark = so->so_rcv.sb_cc + |
| 2859 |
(tp->rcv_up - tp->rcv_nxt) - 1; |
| 2860 |
if (so->so_oobmark == 0) |
| 2861 |
so->so_rcv.sb_state |= SBS_RCVATMARK; |
| 2862 |
sohasoutofband(so); |
| 2863 |
tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); |
| 2864 |
} |
| 2865 |
SOCKBUF_UNLOCK(&so->so_rcv); |
| 2866 |
/* |
| 2867 |
* Remove out of band data so it doesn't get presented to user. |
| 2868 |
* This can happen independent of advancing the URG pointer, |
| 2869 |
* but if two URG's are pending at once, some out-of-band |
| 2870 |
* data may creep in... ick. |
| 2871 |
*/ |
| 2872 |
if (th->th_urp <= (u_long)tlen && |
| 2873 |
!(so->so_options & SO_OOBINLINE)) { |
| 2874 |
/* hdr drop is delayed */ |
| 2875 |
tcp_pulloutofband(so, th, m, drop_hdrlen); |
| 2876 |
} |
| 2877 |
} else { |
| 2878 |
/* |
| 2879 |
* If no out of band data is expected, |
| 2880 |
* pull receive urgent pointer along |
| 2881 |
* with the receive window. |
| 2882 |
*/ |
| 2883 |
if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) |
| 2884 |
tp->rcv_up = tp->rcv_nxt; |
| 2885 |
} |
| 2886 |
dodata: /* XXX */ |
| 2887 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 2888 |
|
| 2889 |
/* |
| 2890 |
* Process the segment text, merging it into the TCP sequencing queue, |
| 2891 |
* and arranging for acknowledgment of receipt if necessary. |
| 2892 |
* This process logically involves adjusting tp->rcv_wnd as data |
| 2893 |
* is presented to the user (this happens in tcp_usrreq.c, |
| 2894 |
* case PRU_RCVD). If a FIN has already been received on this |
| 2895 |
* connection then we just ignore the text. |
| 2896 |
*/ |
| 2897 |
if ((tlen || (thflags & TH_FIN)) && |
| 2898 |
TCPS_HAVERCVDFIN(tp->t_state) == 0) { |
| 2899 |
tcp_seq save_start = th->th_seq; |
| 2900 |
m_adj(m, drop_hdrlen); /* delayed header drop */ |
| 2901 |
/* |
| 2902 |
* Insert segment which includes th into TCP reassembly queue |
| 2903 |
* with control block tp. Set thflags to whether reassembly now |
| 2904 |
* includes a segment with FIN. This handles the common case |
| 2905 |
* inline (segment is the next to be received on an established |
| 2906 |
* connection, and the queue is empty), avoiding linkage into |
| 2907 |
* and removal from the queue and repetition of various |
| 2908 |
* conversions. |
| 2909 |
* Set DELACK for segments received in order, but ack |
| 2910 |
* immediately when segments are out of order (so |
| 2911 |
* fast retransmit can work). |
| 2912 |
*/ |
| 2913 |
if (th->th_seq == tp->rcv_nxt && |
| 2914 |
LIST_EMPTY(&tp->t_segq) && |
| 2915 |
TCPS_HAVEESTABLISHED(tp->t_state)) { |
| 2916 |
if (DELAY_ACK(tp, tlen)) |
| 2917 |
tp->t_flags |= TF_DELACK; |
| 2918 |
else |
| 2919 |
tp->t_flags |= TF_ACKNOW; |
| 2920 |
tp->rcv_nxt += tlen; |
| 2921 |
thflags = th->th_flags & TH_FIN; |
| 2922 |
TCPSTAT_INC(tcps_rcvpack); |
| 2923 |
TCPSTAT_ADD(tcps_rcvbyte, tlen); |
| 2924 |
ND6_HINT(tp); |
| 2925 |
SOCKBUF_LOCK(&so->so_rcv); |
| 2926 |
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) |
| 2927 |
m_freem(m); |
| 2928 |
else |
| 2929 |
sbappendstream_locked(&so->so_rcv, m); |
| 2930 |
/* NB: sorwakeup_locked() does an implicit unlock. */ |
| 2931 |
sorwakeup_locked(so); |
| 2932 |
} else { |
| 2933 |
/* |
| 2934 |
* XXX: Due to the header drop above "th" is |
| 2935 |
* theoretically invalid by now. Fortunately |
| 2936 |
* m_adj() doesn't actually free any mbufs |
| 2937 |
* when trimming from the head. |
| 2938 |
*/ |
| 2939 |
thflags = tcp_reass(tp, th, &tlen, m); |
| 2940 |
tp->t_flags |= TF_ACKNOW; |
| 2941 |
} |
| 2942 |
if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) |
| 2943 |
tcp_update_sack_list(tp, save_start, save_start + tlen); |
| 2944 |
#if 0 |
| 2945 |
/* |
| 2946 |
* Note the amount of data that peer has sent into |
| 2947 |
* our window, in order to estimate the sender's |
| 2948 |
* buffer size. |
| 2949 |
* XXX: Unused. |
| 2950 |
*/ |
| 2951 |
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) |
| 2952 |
len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); |
| 2953 |
else |
| 2954 |
len = so->so_rcv.sb_hiwat; |
| 2955 |
#endif |
| 2956 |
} else { |
| 2957 |
m_freem(m); |
| 2958 |
thflags &= ~TH_FIN; |
| 2959 |
} |
| 2960 |
|
| 2961 |
/* |
| 2962 |
* If FIN is received ACK the FIN and let the user know |
| 2963 |
* that the connection is closing. |
| 2964 |
*/ |
| 2965 |
if (thflags & TH_FIN) { |
| 2966 |
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { |
| 2967 |
socantrcvmore(so); |
| 2968 |
/* |
| 2969 |
* If connection is half-synchronized |
| 2970 |
* (ie NEEDSYN flag on) then delay ACK, |
| 2971 |
* so it may be piggybacked when SYN is sent. |
| 2972 |
* Otherwise, since we received a FIN then no |
| 2973 |
* more input can be expected, send ACK now. |
| 2974 |
*/ |
| 2975 |
if (tp->t_flags & TF_NEEDSYN) |
| 2976 |
tp->t_flags |= TF_DELACK; |
| 2977 |
else |
| 2978 |
tp->t_flags |= TF_ACKNOW; |
| 2979 |
tp->rcv_nxt++; |
| 2980 |
} |
| 2981 |
switch (tp->t_state) { |
| 2982 |
|
| 2983 |
/* |
| 2984 |
* In SYN_RECEIVED and ESTABLISHED STATES |
| 2985 |
* enter the CLOSE_WAIT state. |
| 2986 |
*/ |
| 2987 |
case TCPS_SYN_RECEIVED: |
| 2988 |
tp->t_starttime = ticks; |
| 2989 |
/* FALLTHROUGH */ |
| 2990 |
case TCPS_ESTABLISHED: |
| 2991 |
tp->t_state = TCPS_CLOSE_WAIT; |
| 2992 |
break; |
| 2993 |
|
| 2994 |
/* |
| 2995 |
* If still in FIN_WAIT_1 STATE FIN has not been acked so |
| 2996 |
* enter the CLOSING state. |
| 2997 |
*/ |
| 2998 |
case TCPS_FIN_WAIT_1: |
| 2999 |
tp->t_state = TCPS_CLOSING; |
| 3000 |
break; |
| 3001 |
|
| 3002 |
/* |
| 3003 |
* In FIN_WAIT_2 state enter the TIME_WAIT state, |
| 3004 |
* starting the time-wait timer, turning off the other |
| 3005 |
* standard timers. |
| 3006 |
*/ |
| 3007 |
case TCPS_FIN_WAIT_2: |
| 3008 |
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
| 3009 |
KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata " |
| 3010 |
"TCP_FIN_WAIT_2 ti_locked: %d", __func__, |
| 3011 |
ti_locked)); |
| 3012 |
|
| 3013 |
tcp_twstart(tp); |
| 3014 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 3015 |
return; |
| 3016 |
} |
| 3017 |
} |
| 3018 |
if (ti_locked == TI_WLOCKED) |
| 3019 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 3020 |
ti_locked = TI_UNLOCKED; |
| 3021 |
|
| 3022 |
#ifdef TCPDEBUG |
| 3023 |
if (so->so_options & SO_DEBUG) |
| 3024 |
tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, |
| 3025 |
&tcp_savetcp, 0); |
| 3026 |
#endif |
| 3027 |
|
| 3028 |
/* |
| 3029 |
* Return any desired output. |
| 3030 |
*/ |
| 3031 |
if (needoutput || (tp->t_flags & TF_ACKNOW)) |
| 3032 |
(void) tcp_output(tp); |
| 3033 |
|
| 3034 |
check_delack: |
| 3035 |
KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", |
| 3036 |
__func__, ti_locked)); |
| 3037 |
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); |
| 3038 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 3039 |
|
| 3040 |
if (tp->t_flags & TF_DELACK) { |
| 3041 |
tp->t_flags &= ~TF_DELACK; |
| 3042 |
tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); |
| 3043 |
} |
| 3044 |
INP_WUNLOCK(tp->t_inpcb); |
| 3045 |
return; |
| 3046 |
|
| 3047 |
dropafterack: |
| 3048 |
/* |
| 3049 |
* Generate an ACK dropping incoming segment if it occupies |
| 3050 |
* sequence space, where the ACK reflects our state. |
| 3051 |
* |
| 3052 |
* We can now skip the test for the RST flag since all |
| 3053 |
* paths to this code happen after packets containing |
| 3054 |
* RST have been dropped. |
| 3055 |
* |
| 3056 |
* In the SYN-RECEIVED state, don't send an ACK unless the |
| 3057 |
* segment we received passes the SYN-RECEIVED ACK test. |
| 3058 |
* If it fails send a RST. This breaks the loop in the |
| 3059 |
* "LAND" DoS attack, and also prevents an ACK storm |
| 3060 |
* between two listening ports that have been sent forged |
| 3061 |
* SYN segments, each with the source address of the other. |
| 3062 |
*/ |
| 3063 |
if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && |
| 3064 |
(SEQ_GT(tp->snd_una, th->th_ack) || |
| 3065 |
SEQ_GT(th->th_ack, tp->snd_max)) ) { |
| 3066 |
rstreason = BANDLIM_RST_OPENPORT; |
| 3067 |
goto dropwithreset; |
| 3068 |
} |
| 3069 |
#ifdef TCPDEBUG |
| 3070 |
if (so->so_options & SO_DEBUG) |
| 3071 |
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, |
| 3072 |
&tcp_savetcp, 0); |
| 3073 |
#endif |
| 3074 |
if (ti_locked == TI_WLOCKED) |
| 3075 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 3076 |
ti_locked = TI_UNLOCKED; |
| 3077 |
|
| 3078 |
tp->t_flags |= TF_ACKNOW; |
| 3079 |
(void) tcp_output(tp); |
| 3080 |
INP_WUNLOCK(tp->t_inpcb); |
| 3081 |
m_freem(m); |
| 3082 |
return; |
| 3083 |
|
| 3084 |
dropwithreset: |
| 3085 |
if (ti_locked == TI_WLOCKED) |
| 3086 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 3087 |
ti_locked = TI_UNLOCKED; |
| 3088 |
|
| 3089 |
if (tp != NULL) { |
| 3090 |
tcp_dropwithreset(m, th, tp, tlen, rstreason); |
| 3091 |
INP_WUNLOCK(tp->t_inpcb); |
| 3092 |
} else |
| 3093 |
tcp_dropwithreset(m, th, NULL, tlen, rstreason); |
| 3094 |
return; |
| 3095 |
|
| 3096 |
drop: |
| 3097 |
if (ti_locked == TI_WLOCKED) { |
| 3098 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 3099 |
ti_locked = TI_UNLOCKED; |
| 3100 |
} |
| 3101 |
#ifdef INVARIANTS |
| 3102 |
else |
| 3103 |
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); |
| 3104 |
#endif |
| 3105 |
|
| 3106 |
/* |
| 3107 |
* Drop space held by incoming segment and return. |
| 3108 |
*/ |
| 3109 |
#ifdef TCPDEBUG |
| 3110 |
if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) |
| 3111 |
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, |
| 3112 |
&tcp_savetcp, 0); |
| 3113 |
#endif |
| 3114 |
if (tp != NULL) |
| 3115 |
INP_WUNLOCK(tp->t_inpcb); |
| 3116 |
m_freem(m); |
| 3117 |
} |
| 3118 |
|
| 3119 |
/* |
| 3120 |
* Issue RST and make ACK acceptable to originator of segment. |
| 3121 |
* The mbuf must still include the original packet header. |
| 3122 |
* tp may be NULL. |
| 3123 |
*/ |
| 3124 |
static void |
| 3125 |
tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, |
| 3126 |
int tlen, int rstreason) |
| 3127 |
{ |
| 3128 |
#ifdef INET |
| 3129 |
struct ip *ip; |
| 3130 |
#endif |
| 3131 |
#ifdef INET6 |
| 3132 |
struct ip6_hdr *ip6; |
| 3133 |
#endif |
| 3134 |
|
| 3135 |
if (tp != NULL) { |
| 3136 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 3137 |
} |
| 3138 |
|
| 3139 |
/* Don't bother if destination was broadcast/multicast. */ |
| 3140 |
if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) |
| 3141 |
goto drop; |
| 3142 |
#ifdef INET6 |
| 3143 |
if (mtod(m, struct ip *)->ip_v == 6) { |
| 3144 |
ip6 = mtod(m, struct ip6_hdr *); |
| 3145 |
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || |
| 3146 |
IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) |
| 3147 |
goto drop; |
| 3148 |
/* IPv6 anycast check is done at tcp6_input() */ |
| 3149 |
} |
| 3150 |
#endif |
| 3151 |
#if defined(INET) && defined(INET6) |
| 3152 |
else |
| 3153 |
#endif |
| 3154 |
#ifdef INET |
| 3155 |
{ |
| 3156 |
ip = mtod(m, struct ip *); |
| 3157 |
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || |
| 3158 |
IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || |
| 3159 |
ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || |
| 3160 |
in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) |
| 3161 |
goto drop; |
| 3162 |
} |
| 3163 |
#endif |
| 3164 |
|
| 3165 |
/* Perform bandwidth limiting. */ |
| 3166 |
if (badport_bandlim(rstreason) < 0) |
| 3167 |
goto drop; |
| 3168 |
|
| 3169 |
/* tcp_respond consumes the mbuf chain. */ |
| 3170 |
if (th->th_flags & TH_ACK) { |
| 3171 |
tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, |
| 3172 |
th->th_ack, TH_RST); |
| 3173 |
} else { |
| 3174 |
if (th->th_flags & TH_SYN) |
| 3175 |
tlen++; |
| 3176 |
tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, |
| 3177 |
(tcp_seq)0, TH_RST|TH_ACK); |
| 3178 |
} |
| 3179 |
return; |
| 3180 |
drop: |
| 3181 |
m_freem(m); |
| 3182 |
} |
| 3183 |
|
| 3184 |
/* |
| 3185 |
* Parse TCP options and place in tcpopt. |
| 3186 |
*/ |
| 3187 |
static void |
| 3188 |
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) |
| 3189 |
{ |
| 3190 |
int opt, optlen; |
| 3191 |
|
| 3192 |
to->to_flags = 0; |
| 3193 |
for (; cnt > 0; cnt -= optlen, cp += optlen) { |
| 3194 |
opt = cp[0]; |
| 3195 |
if (opt == TCPOPT_EOL) |
| 3196 |
break; |
| 3197 |
if (opt == TCPOPT_NOP) |
| 3198 |
optlen = 1; |
| 3199 |
else { |
| 3200 |
if (cnt < 2) |
| 3201 |
break; |
| 3202 |
optlen = cp[1]; |
| 3203 |
if (optlen < 2 || optlen > cnt) |
| 3204 |
break; |
| 3205 |
} |
| 3206 |
switch (opt) { |
| 3207 |
case TCPOPT_MAXSEG: |
| 3208 |
if (optlen != TCPOLEN_MAXSEG) |
| 3209 |
continue; |
| 3210 |
if (!(flags & TO_SYN)) |
| 3211 |
continue; |
| 3212 |
to->to_flags |= TOF_MSS; |
| 3213 |
bcopy((char *)cp + 2, |
| 3214 |
(char *)&to->to_mss, sizeof(to->to_mss)); |
| 3215 |
to->to_mss = ntohs(to->to_mss); |
| 3216 |
break; |
| 3217 |
case TCPOPT_WINDOW: |
| 3218 |
if (optlen != TCPOLEN_WINDOW) |
| 3219 |
continue; |
| 3220 |
if (!(flags & TO_SYN)) |
| 3221 |
continue; |
| 3222 |
to->to_flags |= TOF_SCALE; |
| 3223 |
to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); |
| 3224 |
break; |
| 3225 |
case TCPOPT_TIMESTAMP: |
| 3226 |
if (optlen != TCPOLEN_TIMESTAMP) |
| 3227 |
continue; |
| 3228 |
to->to_flags |= TOF_TS; |
| 3229 |
bcopy((char *)cp + 2, |
| 3230 |
(char *)&to->to_tsval, sizeof(to->to_tsval)); |
| 3231 |
to->to_tsval = ntohl(to->to_tsval); |
| 3232 |
bcopy((char *)cp + 6, |
| 3233 |
(char *)&to->to_tsecr, sizeof(to->to_tsecr)); |
| 3234 |
to->to_tsecr = ntohl(to->to_tsecr); |
| 3235 |
break; |
| 3236 |
#ifdef TCP_SIGNATURE |
| 3237 |
/* |
| 3238 |
* XXX In order to reply to a host which has set the |
| 3239 |
* TCP_SIGNATURE option in its initial SYN, we have to |
| 3240 |
* record the fact that the option was observed here |
| 3241 |
* for the syncache code to perform the correct response. |
| 3242 |
*/ |
| 3243 |
case TCPOPT_SIGNATURE: |
| 3244 |
if (optlen != TCPOLEN_SIGNATURE) |
| 3245 |
continue; |
| 3246 |
to->to_flags |= TOF_SIGNATURE; |
| 3247 |
to->to_signature = cp + 2; |
| 3248 |
break; |
| 3249 |
#endif |
| 3250 |
case TCPOPT_SACK_PERMITTED: |
| 3251 |
if (optlen != TCPOLEN_SACK_PERMITTED) |
| 3252 |
continue; |
| 3253 |
if (!(flags & TO_SYN)) |
| 3254 |
continue; |
| 3255 |
if (!V_tcp_do_sack) |
| 3256 |
continue; |
| 3257 |
to->to_flags |= TOF_SACKPERM; |
| 3258 |
break; |
| 3259 |
case TCPOPT_SACK: |
| 3260 |
if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) |
| 3261 |
continue; |
| 3262 |
if (flags & TO_SYN) |
| 3263 |
continue; |
| 3264 |
to->to_flags |= TOF_SACK; |
| 3265 |
to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; |
| 3266 |
to->to_sacks = cp + 2; |
| 3267 |
TCPSTAT_INC(tcps_sack_rcv_blocks); |
| 3268 |
break; |
| 3269 |
default: |
| 3270 |
continue; |
| 3271 |
} |
| 3272 |
} |
| 3273 |
} |
| 3274 |
|
| 3275 |
/* |
| 3276 |
* Pull out of band byte out of a segment so |
| 3277 |
* it doesn't appear in the user's data queue. |
| 3278 |
* It is still reflected in the segment length for |
| 3279 |
* sequencing purposes. |
| 3280 |
*/ |
| 3281 |
static void |
| 3282 |
tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, |
| 3283 |
int off) |
| 3284 |
{ |
| 3285 |
int cnt = off + th->th_urp - 1; |
| 3286 |
|
| 3287 |
while (cnt >= 0) { |
| 3288 |
if (m->m_len > cnt) { |
| 3289 |
char *cp = mtod(m, caddr_t) + cnt; |
| 3290 |
struct tcpcb *tp = sototcpcb(so); |
| 3291 |
|
| 3292 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 3293 |
|
| 3294 |
tp->t_iobc = *cp; |
| 3295 |
tp->t_oobflags |= TCPOOB_HAVEDATA; |
| 3296 |
bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); |
| 3297 |
m->m_len--; |
| 3298 |
if (m->m_flags & M_PKTHDR) |
| 3299 |
m->m_pkthdr.len--; |
| 3300 |
return; |
| 3301 |
} |
| 3302 |
cnt -= m->m_len; |
| 3303 |
m = m->m_next; |
| 3304 |
if (m == NULL) |
| 3305 |
break; |
| 3306 |
} |
| 3307 |
panic("tcp_pulloutofband"); |
| 3308 |
} |
| 3309 |
|
| 3310 |
/* |
| 3311 |
* Collect new round-trip time estimate |
| 3312 |
* and update averages and current timeout. |
| 3313 |
*/ |
| 3314 |
static void |
| 3315 |
tcp_xmit_timer(struct tcpcb *tp, int rtt) |
| 3316 |
{ |
| 3317 |
int delta; |
| 3318 |
|
| 3319 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 3320 |
|
| 3321 |
TCPSTAT_INC(tcps_rttupdated); |
| 3322 |
tp->t_rttupdated++; |
| 3323 |
if (tp->t_srtt != 0) { |
| 3324 |
/* |
| 3325 |
* srtt is stored as fixed point with 5 bits after the |
| 3326 |
* binary point (i.e., scaled by 8). The following magic |
| 3327 |
* is equivalent to the smoothing algorithm in rfc793 with |
| 3328 |
* an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed |
| 3329 |
* point). Adjust rtt to origin 0. |
| 3330 |
*/ |
| 3331 |
delta = ((rtt - 1) << TCP_DELTA_SHIFT) |
| 3332 |
- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); |
| 3333 |
|
| 3334 |
if ((tp->t_srtt += delta) <= 0) |
| 3335 |
tp->t_srtt = 1; |
| 3336 |
|
| 3337 |
/* |
| 3338 |
* We accumulate a smoothed rtt variance (actually, a |
| 3339 |
* smoothed mean difference), then set the retransmit |
| 3340 |
* timer to smoothed rtt + 4 times the smoothed variance. |
| 3341 |
* rttvar is stored as fixed point with 4 bits after the |
| 3342 |
* binary point (scaled by 16). The following is |
| 3343 |
* equivalent to rfc793 smoothing with an alpha of .75 |
| 3344 |
* (rttvar = rttvar*3/4 + |delta| / 4). This replaces |
| 3345 |
* rfc793's wired-in beta. |
| 3346 |
*/ |
| 3347 |
if (delta < 0) |
| 3348 |
delta = -delta; |
| 3349 |
delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); |
| 3350 |
if ((tp->t_rttvar += delta) <= 0) |
| 3351 |
tp->t_rttvar = 1; |
| 3352 |
if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) |
| 3353 |
tp->t_rttbest = tp->t_srtt + tp->t_rttvar; |
| 3354 |
} else { |
| 3355 |
/* |
| 3356 |
* No rtt measurement yet - use the unsmoothed rtt. |
| 3357 |
* Set the variance to half the rtt (so our first |
| 3358 |
* retransmit happens at 3*rtt). |
| 3359 |
*/ |
| 3360 |
tp->t_srtt = rtt << TCP_RTT_SHIFT; |
| 3361 |
tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); |
| 3362 |
tp->t_rttbest = tp->t_srtt + tp->t_rttvar; |
| 3363 |
} |
| 3364 |
tp->t_rtttime = 0; |
| 3365 |
tp->t_rxtshift = 0; |
| 3366 |
|
| 3367 |
/* |
| 3368 |
* the retransmit should happen at rtt + 4 * rttvar. |
| 3369 |
* Because of the way we do the smoothing, srtt and rttvar |
| 3370 |
* will each average +1/2 tick of bias. When we compute |
| 3371 |
* the retransmit timer, we want 1/2 tick of rounding and |
| 3372 |
* 1 extra tick because of +-1/2 tick uncertainty in the |
| 3373 |
* firing of the timer. The bias will give us exactly the |
| 3374 |
* 1.5 tick we need. But, because the bias is |
| 3375 |
* statistical, we have to test that we don't drop below |
| 3376 |
* the minimum feasible timer (which is 2 ticks). |
| 3377 |
*/ |
| 3378 |
TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), |
| 3379 |
max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); |
| 3380 |
|
| 3381 |
/* |
| 3382 |
* We received an ack for a packet that wasn't retransmitted; |
| 3383 |
* it is probably safe to discard any error indications we've |
| 3384 |
* received recently. This isn't quite right, but close enough |
| 3385 |
* for now (a route might have failed after we sent a segment, |
| 3386 |
* and the return path might not be symmetrical). |
| 3387 |
*/ |
| 3388 |
tp->t_softerror = 0; |
| 3389 |
} |
| 3390 |
|
| 3391 |
/* |
| 3392 |
* Determine a reasonable value for maxseg size. |
| 3393 |
* If the route is known, check route for mtu. |
| 3394 |
* If none, use an mss that can be handled on the outgoing interface |
| 3395 |
* without forcing IP to fragment. If no route is found, route has no mtu, |
| 3396 |
* or the destination isn't local, use a default, hopefully conservative |
| 3397 |
* size (usually 512 or the default IP max size, but no more than the mtu |
| 3398 |
* of the interface), as we can't discover anything about intervening |
| 3399 |
* gateways or networks. We also initialize the congestion/slow start |
| 3400 |
* window to be a single segment if the destination isn't local. |
| 3401 |
* While looking at the routing entry, we also initialize other path-dependent |
| 3402 |
* parameters from pre-set or cached values in the routing entry. |
| 3403 |
* |
| 3404 |
* Also take into account the space needed for options that we |
| 3405 |
* send regularly. Make maxseg shorter by that amount to assure |
| 3406 |
* that we can send maxseg amount of data even when the options |
| 3407 |
* are present. Store the upper limit of the length of options plus |
| 3408 |
* data in maxopd. |
| 3409 |
* |
| 3410 |
* NOTE that this routine is only called when we process an incoming |
| 3411 |
* segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS |
| 3412 |
* settings are handled in tcp_mssopt(). |
| 3413 |
*/ |
| 3414 |
void |
| 3415 |
tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, |
| 3416 |
struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap) |
| 3417 |
{ |
| 3418 |
int mss = 0; |
| 3419 |
u_long maxmtu = 0; |
| 3420 |
struct inpcb *inp = tp->t_inpcb; |
| 3421 |
struct hc_metrics_lite metrics; |
| 3422 |
int origoffer; |
| 3423 |
#ifdef INET6 |
| 3424 |
int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; |
| 3425 |
size_t min_protoh = isipv6 ? |
| 3426 |
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : |
| 3427 |
sizeof (struct tcpiphdr); |
| 3428 |
#else |
| 3429 |
const size_t min_protoh = sizeof(struct tcpiphdr); |
| 3430 |
#endif |
| 3431 |
|
| 3432 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 3433 |
|
| 3434 |
if (mtuoffer != -1) { |
| 3435 |
KASSERT(offer == -1, ("%s: conflict", __func__)); |
| 3436 |
offer = mtuoffer - min_protoh; |
| 3437 |
} |
| 3438 |
origoffer = offer; |
| 3439 |
|
| 3440 |
/* Initialize. */ |
| 3441 |
#ifdef INET6 |
| 3442 |
if (isipv6) { |
| 3443 |
maxmtu = tcp_maxmtu6(&inp->inp_inc, cap); |
| 3444 |
tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; |
| 3445 |
} |
| 3446 |
#endif |
| 3447 |
#if defined(INET) && defined(INET6) |
| 3448 |
else |
| 3449 |
#endif |
| 3450 |
#ifdef INET |
| 3451 |
{ |
| 3452 |
maxmtu = tcp_maxmtu(&inp->inp_inc, cap); |
| 3453 |
tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; |
| 3454 |
} |
| 3455 |
#endif |
| 3456 |
|
| 3457 |
/* |
| 3458 |
* No route to sender, stay with default mss and return. |
| 3459 |
*/ |
| 3460 |
if (maxmtu == 0) { |
| 3461 |
/* |
| 3462 |
* In case we return early we need to initialize metrics |
| 3463 |
* to a defined state as tcp_hc_get() would do for us |
| 3464 |
* if there was no cache hit. |
| 3465 |
*/ |
| 3466 |
if (metricptr != NULL) |
| 3467 |
bzero(metricptr, sizeof(struct hc_metrics_lite)); |
| 3468 |
return; |
| 3469 |
} |
| 3470 |
|
| 3471 |
/* What have we got? */ |
| 3472 |
switch (offer) { |
| 3473 |
case 0: |
| 3474 |
/* |
| 3475 |
* Offer == 0 means that there was no MSS on the SYN |
| 3476 |
* segment, in this case we use tcp_mssdflt as |
| 3477 |
* already assigned to t_maxopd above. |
| 3478 |
*/ |
| 3479 |
offer = tp->t_maxopd; |
| 3480 |
break; |
| 3481 |
|
| 3482 |
case -1: |
| 3483 |
/* |
| 3484 |
* Offer == -1 means that we didn't receive SYN yet. |
| 3485 |
*/ |
| 3486 |
/* FALLTHROUGH */ |
| 3487 |
|
| 3488 |
default: |
| 3489 |
/* |
| 3490 |
* Prevent DoS attack with too small MSS. Round up |
| 3491 |
* to at least minmss. |
| 3492 |
*/ |
| 3493 |
offer = max(offer, V_tcp_minmss); |
| 3494 |
} |
| 3495 |
|
| 3496 |
/* |
| 3497 |
* rmx information is now retrieved from tcp_hostcache. |
| 3498 |
*/ |
| 3499 |
tcp_hc_get(&inp->inp_inc, &metrics); |
| 3500 |
if (metricptr != NULL) |
| 3501 |
bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); |
| 3502 |
|
| 3503 |
/* |
| 3504 |
* If there's a discovered mtu int tcp hostcache, use it |
| 3505 |
* else, use the link mtu. |
| 3506 |
*/ |
| 3507 |
if (metrics.rmx_mtu) |
| 3508 |
mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; |
| 3509 |
else { |
| 3510 |
#ifdef INET6 |
| 3511 |
if (isipv6) { |
| 3512 |
mss = maxmtu - min_protoh; |
| 3513 |
if (!V_path_mtu_discovery && |
| 3514 |
!in6_localaddr(&inp->in6p_faddr)) |
| 3515 |
mss = min(mss, V_tcp_v6mssdflt); |
| 3516 |
} |
| 3517 |
#endif |
| 3518 |
#if defined(INET) && defined(INET6) |
| 3519 |
else |
| 3520 |
#endif |
| 3521 |
#ifdef INET |
| 3522 |
{ |
| 3523 |
mss = maxmtu - min_protoh; |
| 3524 |
if (!V_path_mtu_discovery && |
| 3525 |
!in_localaddr(inp->inp_faddr)) |
| 3526 |
mss = min(mss, V_tcp_mssdflt); |
| 3527 |
} |
| 3528 |
#endif |
| 3529 |
/* |
| 3530 |
* XXX - The above conditional (mss = maxmtu - min_protoh) |
| 3531 |
* probably violates the TCP spec. |
| 3532 |
* The problem is that, since we don't know the |
| 3533 |
* other end's MSS, we are supposed to use a conservative |
| 3534 |
* default. But, if we do that, then MTU discovery will |
| 3535 |
* never actually take place, because the conservative |
| 3536 |
* default is much less than the MTUs typically seen |
| 3537 |
* on the Internet today. For the moment, we'll sweep |
| 3538 |
* this under the carpet. |
| 3539 |
* |
| 3540 |
* The conservative default might not actually be a problem |
| 3541 |
* if the only case this occurs is when sending an initial |
| 3542 |
* SYN with options and data to a host we've never talked |
| 3543 |
* to before. Then, they will reply with an MSS value which |
| 3544 |
* will get recorded and the new parameters should get |
| 3545 |
* recomputed. For Further Study. |
| 3546 |
*/ |
| 3547 |
} |
| 3548 |
mss = min(mss, offer); |
| 3549 |
|
| 3550 |
/* |
| 3551 |
* Sanity check: make sure that maxopd will be large |
| 3552 |
* enough to allow some data on segments even if the |
| 3553 |
* all the option space is used (40bytes). Otherwise |
| 3554 |
* funny things may happen in tcp_output. |
| 3555 |
*/ |
| 3556 |
mss = max(mss, 64); |
| 3557 |
|
| 3558 |
/* |
| 3559 |
* maxopd stores the maximum length of data AND options |
| 3560 |
* in a segment; maxseg is the amount of data in a normal |
| 3561 |
* segment. We need to store this value (maxopd) apart |
| 3562 |
* from maxseg, because now every segment carries options |
| 3563 |
* and thus we normally have somewhat less data in segments. |
| 3564 |
*/ |
| 3565 |
tp->t_maxopd = mss; |
| 3566 |
|
| 3567 |
/* |
| 3568 |
* origoffer==-1 indicates that no segments were received yet. |
| 3569 |
* In this case we just guess. |
| 3570 |
*/ |
| 3571 |
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && |
| 3572 |
(origoffer == -1 || |
| 3573 |
(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) |
| 3574 |
mss -= TCPOLEN_TSTAMP_APPA; |
| 3575 |
|
| 3576 |
tp->t_maxseg = mss; |
| 3577 |
} |
| 3578 |
|
| 3579 |
void |
| 3580 |
tcp_mss(struct tcpcb *tp, int offer) |
| 3581 |
{ |
| 3582 |
int mss; |
| 3583 |
u_long bufsize; |
| 3584 |
struct inpcb *inp; |
| 3585 |
struct socket *so; |
| 3586 |
struct hc_metrics_lite metrics; |
| 3587 |
struct tcp_ifcap cap; |
| 3588 |
|
| 3589 |
KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); |
| 3590 |
|
| 3591 |
bzero(&cap, sizeof(cap)); |
| 3592 |
tcp_mss_update(tp, offer, -1, &metrics, &cap); |
| 3593 |
|
| 3594 |
mss = tp->t_maxseg; |
| 3595 |
inp = tp->t_inpcb; |
| 3596 |
|
| 3597 |
/* |
| 3598 |
* If there's a pipesize, change the socket buffer to that size, |
| 3599 |
* don't change if sb_hiwat is different than default (then it |
| 3600 |
* has been changed on purpose with setsockopt). |
| 3601 |
* Make the socket buffers an integral number of mss units; |
| 3602 |
* if the mss is larger than the socket buffer, decrease the mss. |
| 3603 |
*/ |
| 3604 |
so = inp->inp_socket; |
| 3605 |
SOCKBUF_LOCK(&so->so_snd); |
| 3606 |
if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe) |
| 3607 |
bufsize = metrics.rmx_sendpipe; |
| 3608 |
else |
| 3609 |
bufsize = so->so_snd.sb_hiwat; |
| 3610 |
if (bufsize < mss) |
| 3611 |
mss = bufsize; |
| 3612 |
else { |
| 3613 |
bufsize = roundup(bufsize, mss); |
| 3614 |
if (bufsize > sb_max) |
| 3615 |
bufsize = sb_max; |
| 3616 |
if (bufsize > so->so_snd.sb_hiwat) |
| 3617 |
(void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); |
| 3618 |
} |
| 3619 |
SOCKBUF_UNLOCK(&so->so_snd); |
| 3620 |
tp->t_maxseg = mss; |
| 3621 |
|
| 3622 |
SOCKBUF_LOCK(&so->so_rcv); |
| 3623 |
if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe) |
| 3624 |
bufsize = metrics.rmx_recvpipe; |
| 3625 |
else |
| 3626 |
bufsize = so->so_rcv.sb_hiwat; |
| 3627 |
if (bufsize > mss) { |
| 3628 |
bufsize = roundup(bufsize, mss); |
| 3629 |
if (bufsize > sb_max) |
| 3630 |
bufsize = sb_max; |
| 3631 |
if (bufsize > so->so_rcv.sb_hiwat) |
| 3632 |
(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); |
| 3633 |
} |
| 3634 |
SOCKBUF_UNLOCK(&so->so_rcv); |
| 3635 |
|
| 3636 |
/* Check the interface for TSO capabilities. */ |
| 3637 |
if (cap.ifcap & CSUM_TSO) { |
| 3638 |
tp->t_flags |= TF_TSO; |
| 3639 |
tp->t_tsomax = cap.tsomax; |
| 3640 |
tp->t_tsomaxsegcount = cap.tsomaxsegcount; |
| 3641 |
tp->t_tsomaxsegsize = cap.tsomaxsegsize; |
| 3642 |
} |
| 3643 |
} |
| 3644 |
|
| 3645 |
/* |
| 3646 |
* Determine the MSS option to send on an outgoing SYN. |
| 3647 |
*/ |
| 3648 |
int |
| 3649 |
tcp_mssopt(struct in_conninfo *inc) |
| 3650 |
{ |
| 3651 |
int mss = 0; |
| 3652 |
u_long maxmtu = 0; |
| 3653 |
u_long thcmtu = 0; |
| 3654 |
size_t min_protoh; |
| 3655 |
|
| 3656 |
KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); |
| 3657 |
|
| 3658 |
#ifdef INET6 |
| 3659 |
if (inc->inc_flags & INC_ISIPV6) { |
| 3660 |
mss = V_tcp_v6mssdflt; |
| 3661 |
maxmtu = tcp_maxmtu6(inc, NULL); |
| 3662 |
min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); |
| 3663 |
} |
| 3664 |
#endif |
| 3665 |
#if defined(INET) && defined(INET6) |
| 3666 |
else |
| 3667 |
#endif |
| 3668 |
#ifdef INET |
| 3669 |
{ |
| 3670 |
mss = V_tcp_mssdflt; |
| 3671 |
maxmtu = tcp_maxmtu(inc, NULL); |
| 3672 |
min_protoh = sizeof(struct tcpiphdr); |
| 3673 |
} |
| 3674 |
#endif |
| 3675 |
#if defined(INET6) || defined(INET) |
| 3676 |
thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ |
| 3677 |
#endif |
| 3678 |
|
| 3679 |
if (maxmtu && thcmtu) |
| 3680 |
mss = min(maxmtu, thcmtu) - min_protoh; |
| 3681 |
else if (maxmtu || thcmtu) |
| 3682 |
mss = max(maxmtu, thcmtu) - min_protoh; |
| 3683 |
|
| 3684 |
return (mss); |
| 3685 |
} |
| 3686 |
|
| 3687 |
|
| 3688 |
/* |
| 3689 |
* When a partial ack arrives, force the retransmission of the |
| 3690 |
* next unacknowledged segment. Do not clear tp->t_dupacks. |
| 3691 |
* By setting snd_nxt to th_ack, this forces the retransmission timer to |
| 3692 |
* be started again. |
| 3693 |
*/ |
| 3694 |
static void |
| 3695 |
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) |
| 3696 |
{ |
| 3697 |
tcp_seq onxt = tp->snd_nxt; |
| 3698 |
u_long ocwnd = tp->snd_cwnd; |
| 3699 |
|
| 3700 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 3701 |
|
| 3702 |
tcp_timer_activate(tp, TT_REXMT, 0); |
| 3703 |
tp->t_rtttime = 0; |
| 3704 |
tp->snd_nxt = th->th_ack; |
| 3705 |
/* |
| 3706 |
* Set snd_cwnd to one segment beyond acknowledged offset. |
| 3707 |
* (tp->snd_una has not yet been updated when this function is called.) |
| 3708 |
*/ |
| 3709 |
tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); |
| 3710 |
tp->t_flags |= TF_ACKNOW; |
| 3711 |
(void) tcp_output(tp); |
| 3712 |
tp->snd_cwnd = ocwnd; |
| 3713 |
if (SEQ_GT(onxt, tp->snd_nxt)) |
| 3714 |
tp->snd_nxt = onxt; |
| 3715 |
/* |
| 3716 |
* Partial window deflation. Relies on fact that tp->snd_una |
| 3717 |
* not updated yet. |
| 3718 |
*/ |
| 3719 |
if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) |
| 3720 |
tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); |
| 3721 |
else |
| 3722 |
tp->snd_cwnd = 0; |
| 3723 |
tp->snd_cwnd += tp->t_maxseg; |
| 3724 |
} |