| 1 |
/*- |
| 2 |
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 |
| 3 |
* The Regents of the University of California. All rights reserved. |
| 4 |
* |
| 5 |
* Redistribution and use in source and binary forms, with or without |
| 6 |
* modification, are permitted provided that the following conditions |
| 7 |
* are met: |
| 8 |
* 1. Redistributions of source code must retain the above copyright |
| 9 |
* notice, this list of conditions and the following disclaimer. |
| 10 |
* 2. Redistributions in binary form must reproduce the above copyright |
| 11 |
* notice, this list of conditions and the following disclaimer in the |
| 12 |
* documentation and/or other materials provided with the distribution. |
| 13 |
* 4. Neither the name of the University nor the names of its contributors |
| 14 |
* may be used to endorse or promote products derived from this software |
| 15 |
* without specific prior written permission. |
| 16 |
* |
| 17 |
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 18 |
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 19 |
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 20 |
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 21 |
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 22 |
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 23 |
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 24 |
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 25 |
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 26 |
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 27 |
* SUCH DAMAGE. |
| 28 |
* |
| 29 |
* @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 |
| 30 |
*/ |
| 31 |
|
| 32 |
#include <sys/cdefs.h> |
| 33 |
__FBSDID("$FreeBSD$"); |
| 34 |
|
| 35 |
#include "opt_compat.h" |
| 36 |
#include "opt_inet.h" |
| 37 |
#include "opt_inet6.h" |
| 38 |
#include "opt_ipsec.h" |
| 39 |
#include "opt_tcpdebug.h" |
| 40 |
|
| 41 |
#include <sys/param.h> |
| 42 |
#include <sys/systm.h> |
| 43 |
#include <sys/callout.h> |
| 44 |
#include <sys/eventhandler.h> |
| 45 |
#ifdef TCP_HHOOK |
| 46 |
#include <sys/hhook.h> |
| 47 |
#endif |
| 48 |
#include <sys/kernel.h> |
| 49 |
#ifdef TCP_HHOOK |
| 50 |
#include <sys/khelp.h> |
| 51 |
#endif |
| 52 |
#include <sys/sysctl.h> |
| 53 |
#include <sys/jail.h> |
| 54 |
#include <sys/malloc.h> |
| 55 |
#include <sys/refcount.h> |
| 56 |
#include <sys/mbuf.h> |
| 57 |
#ifdef INET6 |
| 58 |
#include <sys/domain.h> |
| 59 |
#endif |
| 60 |
#include <sys/priv.h> |
| 61 |
#include <sys/proc.h> |
| 62 |
#include <sys/sdt.h> |
| 63 |
#include <sys/socket.h> |
| 64 |
#include <sys/socketvar.h> |
| 65 |
#include <sys/protosw.h> |
| 66 |
#include <sys/random.h> |
| 67 |
|
| 68 |
#include <vm/uma.h> |
| 69 |
|
| 70 |
#include <net/route.h> |
| 71 |
#include <net/if.h> |
| 72 |
#include <net/if_var.h> |
| 73 |
#include <net/vnet.h> |
| 74 |
|
| 75 |
#include <netinet/in.h> |
| 76 |
#include <netinet/in_fib.h> |
| 77 |
#include <netinet/in_kdtrace.h> |
| 78 |
#include <netinet/in_pcb.h> |
| 79 |
#include <netinet/in_systm.h> |
| 80 |
#include <netinet/in_var.h> |
| 81 |
#include <netinet/ip.h> |
| 82 |
#include <netinet/ip_icmp.h> |
| 83 |
#include <netinet/ip_var.h> |
| 84 |
#ifdef INET6 |
| 85 |
#include <netinet/icmp6.h> |
| 86 |
#include <netinet/ip6.h> |
| 87 |
#include <netinet6/in6_fib.h> |
| 88 |
#include <netinet6/in6_pcb.h> |
| 89 |
#include <netinet6/ip6_var.h> |
| 90 |
#include <netinet6/scope6_var.h> |
| 91 |
#include <netinet6/nd6.h> |
| 92 |
#endif |
| 93 |
|
| 94 |
#ifdef TCP_RFC7413 |
| 95 |
#include <netinet/tcp_fastopen.h> |
| 96 |
#endif |
| 97 |
#include <netinet/tcp.h> |
| 98 |
#include <netinet/tcp_fsm.h> |
| 99 |
#include <netinet/tcp_seq.h> |
| 100 |
#include <netinet/tcp_timer.h> |
| 101 |
#include <netinet/tcp_var.h> |
| 102 |
#include <netinet/tcp_syncache.h> |
| 103 |
#include <netinet/cc/cc.h> |
| 104 |
#ifdef INET6 |
| 105 |
#include <netinet6/tcp6_var.h> |
| 106 |
#endif |
| 107 |
#include <netinet/tcpip.h> |
| 108 |
#ifdef TCPPCAP |
| 109 |
#include <netinet/tcp_pcap.h> |
| 110 |
#endif |
| 111 |
#ifdef TCPDEBUG |
| 112 |
#include <netinet/tcp_debug.h> |
| 113 |
#endif |
| 114 |
#ifdef INET6 |
| 115 |
#include <netinet6/ip6protosw.h> |
| 116 |
#endif |
| 117 |
#ifdef TCP_OFFLOAD |
| 118 |
#include <netinet/tcp_offload.h> |
| 119 |
#endif |
| 120 |
|
| 121 |
#ifdef IPSEC |
| 122 |
#include <netipsec/ipsec.h> |
| 123 |
#include <netipsec/xform.h> |
| 124 |
#ifdef INET6 |
| 125 |
#include <netipsec/ipsec6.h> |
| 126 |
#endif |
| 127 |
#include <netipsec/key.h> |
| 128 |
#include <sys/syslog.h> |
| 129 |
#endif /*IPSEC*/ |
| 130 |
|
| 131 |
#include <machine/in_cksum.h> |
| 132 |
#include <sys/md5.h> |
| 133 |
|
| 134 |
#include <security/mac/mac_framework.h> |
| 135 |
|
| 136 |
VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; |
| 137 |
#ifdef INET6 |
| 138 |
VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; |
| 139 |
#endif |
| 140 |
|
| 141 |
struct rwlock tcp_function_lock; |
| 142 |
|
| 143 |
static int |
| 144 |
sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) |
| 145 |
{ |
| 146 |
int error, new; |
| 147 |
|
| 148 |
new = V_tcp_mssdflt; |
| 149 |
error = sysctl_handle_int(oidp, &new, 0, req); |
| 150 |
if (error == 0 && req->newptr) { |
| 151 |
if (new < TCP_MINMSS) |
| 152 |
error = EINVAL; |
| 153 |
else |
| 154 |
V_tcp_mssdflt = new; |
| 155 |
} |
| 156 |
return (error); |
| 157 |
} |
| 158 |
|
| 159 |
SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, |
| 160 |
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, |
| 161 |
&sysctl_net_inet_tcp_mss_check, "I", |
| 162 |
"Default TCP Maximum Segment Size"); |
| 163 |
|
| 164 |
#ifdef INET6 |
| 165 |
static int |
| 166 |
sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) |
| 167 |
{ |
| 168 |
int error, new; |
| 169 |
|
| 170 |
new = V_tcp_v6mssdflt; |
| 171 |
error = sysctl_handle_int(oidp, &new, 0, req); |
| 172 |
if (error == 0 && req->newptr) { |
| 173 |
if (new < TCP_MINMSS) |
| 174 |
error = EINVAL; |
| 175 |
else |
| 176 |
V_tcp_v6mssdflt = new; |
| 177 |
} |
| 178 |
return (error); |
| 179 |
} |
| 180 |
|
| 181 |
SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, |
| 182 |
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, |
| 183 |
&sysctl_net_inet_tcp_mss_v6_check, "I", |
| 184 |
"Default TCP Maximum Segment Size for IPv6"); |
| 185 |
#endif /* INET6 */ |
| 186 |
|
| 187 |
/*
 * Minimum MSS we accept and use.  This prevents DoS attacks where
 * we are forced to a ridiculously low MSS like 20 and send hundreds
 * of packets instead of one.  The effect scales with the available
 * bandwidth and quickly saturates the CPU and network interface
 * with packet generation and sending.  Set to zero to disable MINMSS
 * checking.  This setting prevents us from sending too small packets.
 */
| 195 |
VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; |
| 196 |
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, |
| 197 |
&VNET_NAME(tcp_minmss), 0, |
| 198 |
"Minimum TCP Maximum Segment Size"); |
| 199 |
|
| 200 |
VNET_DEFINE(int, tcp_do_rfc1323) = 1; |
| 201 |
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, |
| 202 |
&VNET_NAME(tcp_do_rfc1323), 0, |
| 203 |
"Enable rfc1323 (high performance TCP) extensions"); |
| 204 |
|
| 205 |
static int tcp_log_debug = 0; |
| 206 |
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, |
| 207 |
&tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); |
| 208 |
|
| 209 |
static int tcp_tcbhashsize; |
| 210 |
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, |
| 211 |
&tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); |
| 212 |
|
| 213 |
static int do_tcpdrain = 1; |
| 214 |
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, |
| 215 |
"Enable tcp_drain routine for extra help when low on mbufs"); |
| 216 |
|
| 217 |
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, |
| 218 |
&VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); |
| 219 |
|
| 220 |
static VNET_DEFINE(int, icmp_may_rst) = 1; |
| 221 |
#define V_icmp_may_rst VNET(icmp_may_rst) |
| 222 |
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, |
| 223 |
&VNET_NAME(icmp_may_rst), 0, |
| 224 |
"Certain ICMP unreachable messages may abort connections in SYN_SENT"); |
| 225 |
|
| 226 |
static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; |
| 227 |
#define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) |
| 228 |
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, |
| 229 |
&VNET_NAME(tcp_isn_reseed_interval), 0, |
| 230 |
"Seconds between reseeding of ISN secret"); |
| 231 |
|
| 232 |
static int tcp_soreceive_stream; |
| 233 |
SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, |
| 234 |
&tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); |
| 235 |
|
| 236 |
#ifdef TCP_SIGNATURE |
| 237 |
static int tcp_sig_checksigs = 1; |
| 238 |
SYSCTL_INT(_net_inet_tcp, OID_AUTO, signature_verify_input, CTLFLAG_RW, |
| 239 |
&tcp_sig_checksigs, 0, "Verify RFC2385 digests on inbound traffic"); |
| 240 |
#endif |
| 241 |
|
| 242 |
VNET_DEFINE(uma_zone_t, sack_hole_zone); |
| 243 |
#define V_sack_hole_zone VNET(sack_hole_zone) |
| 244 |
|
| 245 |
#ifdef TCP_HHOOK |
| 246 |
VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); |
| 247 |
#endif |
| 248 |
|
| 249 |
static struct inpcb *tcp_notify(struct inpcb *, int); |
| 250 |
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); |
| 251 |
static void tcp_mtudisc(struct inpcb *, int); |
| 252 |
static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, |
| 253 |
void *ip4hdr, const void *ip6hdr); |
| 254 |
|
| 255 |
|
| 256 |
static struct tcp_function_block tcp_def_funcblk = { |
| 257 |
"default", |
| 258 |
tcp_output, |
| 259 |
tcp_do_segment, |
| 260 |
tcp_default_ctloutput, |
| 261 |
NULL, |
| 262 |
NULL, |
| 263 |
NULL, |
| 264 |
NULL, |
| 265 |
NULL, |
| 266 |
NULL, |
| 267 |
0, |
| 268 |
0 |
| 269 |
}; |
| 270 |
|
| 271 |
int t_functions_inited = 0; |
| 272 |
struct tcp_funchead t_functions; |
| 273 |
static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; |
| 274 |
|
| 275 |
static void |
| 276 |
init_tcp_functions(void) |
| 277 |
{ |
| 278 |
if (t_functions_inited == 0) { |
| 279 |
TAILQ_INIT(&t_functions); |
| 280 |
rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0); |
| 281 |
t_functions_inited = 1; |
| 282 |
} |
| 283 |
} |
| 284 |
|
| 285 |
static struct tcp_function_block * |
| 286 |
find_tcp_functions_locked(struct tcp_function_set *fs) |
| 287 |
{ |
| 288 |
struct tcp_function *f; |
| 289 |
struct tcp_function_block *blk=NULL; |
| 290 |
|
| 291 |
TAILQ_FOREACH(f, &t_functions, tf_next) { |
| 292 |
if (strcmp(f->tf_fb->tfb_tcp_block_name, fs->function_set_name) == 0) { |
| 293 |
blk = f->tf_fb; |
| 294 |
break; |
| 295 |
} |
| 296 |
} |
| 297 |
return(blk); |
| 298 |
} |
| 299 |
|
| 300 |
static struct tcp_function_block * |
| 301 |
find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) |
| 302 |
{ |
| 303 |
struct tcp_function_block *rblk=NULL; |
| 304 |
struct tcp_function *f; |
| 305 |
|
| 306 |
TAILQ_FOREACH(f, &t_functions, tf_next) { |
| 307 |
if (f->tf_fb == blk) { |
| 308 |
rblk = blk; |
| 309 |
if (s) { |
| 310 |
*s = f; |
| 311 |
} |
| 312 |
break; |
| 313 |
} |
| 314 |
} |
| 315 |
return (rblk); |
| 316 |
} |
| 317 |
|
| 318 |
struct tcp_function_block * |
| 319 |
find_and_ref_tcp_functions(struct tcp_function_set *fs) |
| 320 |
{ |
| 321 |
struct tcp_function_block *blk; |
| 322 |
|
| 323 |
rw_rlock(&tcp_function_lock); |
| 324 |
blk = find_tcp_functions_locked(fs); |
| 325 |
if (blk) |
| 326 |
refcount_acquire(&blk->tfb_refcnt); |
| 327 |
rw_runlock(&tcp_function_lock); |
| 328 |
return(blk); |
| 329 |
} |
| 330 |
|
| 331 |
struct tcp_function_block * |
| 332 |
find_and_ref_tcp_fb(struct tcp_function_block *blk) |
| 333 |
{ |
| 334 |
struct tcp_function_block *rblk; |
| 335 |
|
| 336 |
rw_rlock(&tcp_function_lock); |
| 337 |
rblk = find_tcp_fb_locked(blk, NULL); |
| 338 |
if (rblk) |
| 339 |
refcount_acquire(&rblk->tfb_refcnt); |
| 340 |
rw_runlock(&tcp_function_lock); |
| 341 |
return(rblk); |
| 342 |
} |
| 343 |
|
| 344 |
|
| 345 |
static int |
| 346 |
sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) |
| 347 |
{ |
| 348 |
int error=ENOENT; |
| 349 |
struct tcp_function_set fs; |
| 350 |
struct tcp_function_block *blk; |
| 351 |
|
| 352 |
memset(&fs, 0, sizeof(fs)); |
| 353 |
rw_rlock(&tcp_function_lock); |
| 354 |
blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); |
| 355 |
if (blk) { |
| 356 |
/* Found him */ |
| 357 |
strcpy(fs.function_set_name, blk->tfb_tcp_block_name); |
| 358 |
fs.pcbcnt = blk->tfb_refcnt; |
| 359 |
} |
| 360 |
rw_runlock(&tcp_function_lock); |
| 361 |
error = sysctl_handle_string(oidp, fs.function_set_name, |
| 362 |
sizeof(fs.function_set_name), req); |
| 363 |
|
| 364 |
/* Check for error or no change */ |
| 365 |
if (error != 0 || req->newptr == NULL) |
| 366 |
return(error); |
| 367 |
|
| 368 |
rw_wlock(&tcp_function_lock); |
| 369 |
blk = find_tcp_functions_locked(&fs); |
| 370 |
if ((blk == NULL) || |
| 371 |
(blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { |
| 372 |
error = ENOENT; |
| 373 |
goto done; |
| 374 |
} |
| 375 |
tcp_func_set_ptr = blk; |
| 376 |
done: |
| 377 |
rw_wunlock(&tcp_function_lock); |
| 378 |
return (error); |
| 379 |
} |
| 380 |
|
| 381 |
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, |
| 382 |
CTLTYPE_STRING | CTLFLAG_RW, |
| 383 |
NULL, 0, sysctl_net_inet_default_tcp_functions, "A", |
| 384 |
"Set/get the default TCP functions"); |
| 385 |
|
| 386 |
static int |
| 387 |
sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) |
| 388 |
{ |
| 389 |
int error, cnt, linesz; |
| 390 |
struct tcp_function *f; |
| 391 |
char *buffer, *cp; |
| 392 |
size_t bufsz, outsz; |
| 393 |
|
| 394 |
cnt = 0; |
| 395 |
rw_rlock(&tcp_function_lock); |
| 396 |
TAILQ_FOREACH(f, &t_functions, tf_next) { |
| 397 |
cnt++; |
| 398 |
} |
| 399 |
rw_runlock(&tcp_function_lock); |
| 400 |
|
| 401 |
bufsz = (cnt+2) * (TCP_FUNCTION_NAME_LEN_MAX + 12) + 1; |
| 402 |
buffer = malloc(bufsz, M_TEMP, M_WAITOK); |
| 403 |
|
| 404 |
error = 0; |
| 405 |
cp = buffer; |
| 406 |
|
| 407 |
linesz = snprintf(cp, bufsz, "\n%-32s%c %s\n", "Stack", 'D', "PCB count"); |
| 408 |
cp += linesz; |
| 409 |
bufsz -= linesz; |
| 410 |
outsz = linesz; |
| 411 |
|
| 412 |
rw_rlock(&tcp_function_lock); |
| 413 |
TAILQ_FOREACH(f, &t_functions, tf_next) { |
| 414 |
linesz = snprintf(cp, bufsz, "%-32s%c %u\n", |
| 415 |
f->tf_fb->tfb_tcp_block_name, |
| 416 |
(f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', |
| 417 |
f->tf_fb->tfb_refcnt); |
| 418 |
if (linesz >= bufsz) { |
| 419 |
error = EOVERFLOW; |
| 420 |
break; |
| 421 |
} |
| 422 |
cp += linesz; |
| 423 |
bufsz -= linesz; |
| 424 |
outsz += linesz; |
| 425 |
} |
| 426 |
rw_runlock(&tcp_function_lock); |
| 427 |
if (error == 0) |
| 428 |
error = sysctl_handle_string(oidp, buffer, outsz + 1, req); |
| 429 |
free(buffer, M_TEMP); |
| 430 |
return (error); |
| 431 |
} |
| 432 |
|
| 433 |
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, |
| 434 |
CTLTYPE_STRING|CTLFLAG_RD, |
| 435 |
NULL, 0, sysctl_net_inet_list_available, "A", |
| 436 |
"list available TCP Function sets"); |
| 437 |
|
| 438 |
/* |
| 439 |
* Target size of TCP PCB hash tables. Must be a power of two. |
| 440 |
* |
| 441 |
* Note that this can be overridden by the kernel environment |
| 442 |
* variable net.inet.tcp.tcbhashsize |
| 443 |
*/ |
| 444 |
#ifndef TCBHASHSIZE |
| 445 |
#define TCBHASHSIZE 0 |
| 446 |
#endif |
| 447 |
|
| 448 |
/* |
| 449 |
* XXX |
| 450 |
* Callouts should be moved into struct tcp directly. They are currently |
| 451 |
* separate because the tcpcb structure is exported to userland for sysctl |
| 452 |
* parsing purposes, which do not know about callouts. |
| 453 |
*/ |
| 454 |
struct tcpcb_mem { |
| 455 |
struct tcpcb tcb; |
| 456 |
struct tcp_timer tt; |
| 457 |
struct cc_var ccv; |
| 458 |
#ifdef TCP_HHOOK |
| 459 |
struct osd osd; |
| 460 |
#endif |
| 461 |
}; |
| 462 |
|
| 463 |
static VNET_DEFINE(uma_zone_t, tcpcb_zone); |
| 464 |
#define V_tcpcb_zone VNET(tcpcb_zone) |
| 465 |
|
| 466 |
MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); |
| 467 |
MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); |
| 468 |
|
| 469 |
static struct mtx isn_mtx; |
| 470 |
|
| 471 |
#define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) |
| 472 |
#define ISN_LOCK() mtx_lock(&isn_mtx) |
| 473 |
#define ISN_UNLOCK() mtx_unlock(&isn_mtx) |
| 474 |
|
| 475 |
/* |
| 476 |
* TCP initialization. |
| 477 |
*/ |
| 478 |
static void |
| 479 |
tcp_zone_change(void *tag) |
| 480 |
{ |
| 481 |
|
| 482 |
uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); |
| 483 |
uma_zone_set_max(V_tcpcb_zone, maxsockets); |
| 484 |
tcp_tw_zone_change(); |
| 485 |
} |
| 486 |
|
| 487 |
/*
 * Item-init callback handed to in_pcbinfo_init() by tcp_init().
 *
 * mem is treated as a struct inpcb and its lock is initialized.  The size
 * and flags arguments are not used.  Always returns 0.
 */
static int
tcp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb;

	pcb = mem;
	INP_LOCK_INIT(pcb, "inp", "tcpinp");

	return (0);
}
| 495 |
|
| 496 |
/*
 * Take a value and get the next power of 2 that doesn't overflow.
 * Used to size the tcp_inpcb hash buckets.
 *
 * Returns the smallest power of two strictly greater than size, capped at
 * 2^30 (the largest power of two an int can hold).  A size of zero or less
 * yields 1.
 *
 * This deliberately avoids "1 << fls(size)": for size >= 2^30 that shifts
 * into the sign bit, and for a negative size (reachable through a bad
 * tunable) it shifts by the full width of int.  Both are undefined
 * behaviour.  The doubling loop below never shifts past bit 30 and runs at
 * most 30 times.
 */
static int
maketcp_hashsize(int size)
{
	const int hashsize_max = 1 << 30;
	int hashsize;

	hashsize = 1;
	while (hashsize <= size && hashsize < hashsize_max)
		hashsize <<= 1;
	return (hashsize);
}
| 516 |
|
| 517 |
int |
| 518 |
register_tcp_functions(struct tcp_function_block *blk, int wait) |
| 519 |
{ |
| 520 |
struct tcp_function_block *lblk; |
| 521 |
struct tcp_function *n; |
| 522 |
struct tcp_function_set fs; |
| 523 |
|
| 524 |
if (t_functions_inited == 0) { |
| 525 |
init_tcp_functions(); |
| 526 |
} |
| 527 |
if ((blk->tfb_tcp_output == NULL) || |
| 528 |
(blk->tfb_tcp_do_segment == NULL) || |
| 529 |
(blk->tfb_tcp_ctloutput == NULL) || |
| 530 |
(strlen(blk->tfb_tcp_block_name) == 0)) { |
| 531 |
/* |
| 532 |
* These functions are required and you |
| 533 |
* need a name. |
| 534 |
*/ |
| 535 |
return (EINVAL); |
| 536 |
} |
| 537 |
if (blk->tfb_tcp_timer_stop_all || |
| 538 |
blk->tfb_tcp_timer_activate || |
| 539 |
blk->tfb_tcp_timer_active || |
| 540 |
blk->tfb_tcp_timer_stop) { |
| 541 |
/* |
| 542 |
* If you define one timer function you |
| 543 |
* must have them all. |
| 544 |
*/ |
| 545 |
if ((blk->tfb_tcp_timer_stop_all == NULL) || |
| 546 |
(blk->tfb_tcp_timer_activate == NULL) || |
| 547 |
(blk->tfb_tcp_timer_active == NULL) || |
| 548 |
(blk->tfb_tcp_timer_stop == NULL)) { |
| 549 |
return (EINVAL); |
| 550 |
} |
| 551 |
} |
| 552 |
n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); |
| 553 |
if (n == NULL) { |
| 554 |
return (ENOMEM); |
| 555 |
} |
| 556 |
n->tf_fb = blk; |
| 557 |
strcpy(fs.function_set_name, blk->tfb_tcp_block_name); |
| 558 |
rw_wlock(&tcp_function_lock); |
| 559 |
lblk = find_tcp_functions_locked(&fs); |
| 560 |
if (lblk) { |
| 561 |
/* Duplicate name space not allowed */ |
| 562 |
rw_wunlock(&tcp_function_lock); |
| 563 |
free(n, M_TCPFUNCTIONS); |
| 564 |
return (EALREADY); |
| 565 |
} |
| 566 |
refcount_init(&blk->tfb_refcnt, 0); |
| 567 |
blk->tfb_flags = 0; |
| 568 |
TAILQ_INSERT_TAIL(&t_functions, n, tf_next); |
| 569 |
rw_wunlock(&tcp_function_lock); |
| 570 |
return(0); |
| 571 |
} |
| 572 |
|
| 573 |
int |
| 574 |
deregister_tcp_functions(struct tcp_function_block *blk) |
| 575 |
{ |
| 576 |
struct tcp_function_block *lblk; |
| 577 |
struct tcp_function *f; |
| 578 |
int error=ENOENT; |
| 579 |
|
| 580 |
if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { |
| 581 |
/* You can't un-register the default */ |
| 582 |
return (EPERM); |
| 583 |
} |
| 584 |
rw_wlock(&tcp_function_lock); |
| 585 |
if (blk == tcp_func_set_ptr) { |
| 586 |
/* You can't free the current default */ |
| 587 |
rw_wunlock(&tcp_function_lock); |
| 588 |
return (EBUSY); |
| 589 |
} |
| 590 |
if (blk->tfb_refcnt) { |
| 591 |
/* Still tcb attached, mark it. */ |
| 592 |
blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; |
| 593 |
rw_wunlock(&tcp_function_lock); |
| 594 |
return (EBUSY); |
| 595 |
} |
| 596 |
lblk = find_tcp_fb_locked(blk, &f); |
| 597 |
if (lblk) { |
| 598 |
/* Found */ |
| 599 |
TAILQ_REMOVE(&t_functions, f, tf_next); |
| 600 |
f->tf_fb = NULL; |
| 601 |
free(f, M_TCPFUNCTIONS); |
| 602 |
error = 0; |
| 603 |
} |
| 604 |
rw_wunlock(&tcp_function_lock); |
| 605 |
return (error); |
| 606 |
} |
| 607 |
|
| 608 |
void |
| 609 |
tcp_init(void) |
| 610 |
{ |
| 611 |
const char *tcbhash_tuneable; |
| 612 |
int hashsize; |
| 613 |
|
| 614 |
tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; |
| 615 |
|
| 616 |
#ifdef TCP_HHOOK |
| 617 |
if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, |
| 618 |
&V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) |
| 619 |
printf("%s: WARNING: unable to register helper hook\n", __func__); |
| 620 |
if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, |
| 621 |
&V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) |
| 622 |
printf("%s: WARNING: unable to register helper hook\n", __func__); |
| 623 |
#endif |
| 624 |
hashsize = TCBHASHSIZE; |
| 625 |
TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); |
| 626 |
if (hashsize == 0) { |
| 627 |
/* |
| 628 |
* Auto tune the hash size based on maxsockets. |
| 629 |
* A perfect hash would have a 1:1 mapping |
| 630 |
* (hashsize = maxsockets) however it's been |
| 631 |
* suggested that O(2) average is better. |
| 632 |
*/ |
| 633 |
hashsize = maketcp_hashsize(maxsockets / 4); |
| 634 |
/* |
| 635 |
* Our historical default is 512, |
| 636 |
* do not autotune lower than this. |
| 637 |
*/ |
| 638 |
if (hashsize < 512) |
| 639 |
hashsize = 512; |
| 640 |
if (bootverbose && IS_DEFAULT_VNET(curvnet)) |
| 641 |
printf("%s: %s auto tuned to %d\n", __func__, |
| 642 |
tcbhash_tuneable, hashsize); |
| 643 |
} |
| 644 |
/* |
| 645 |
* We require a hashsize to be a power of two. |
| 646 |
* Previously if it was not a power of two we would just reset it |
| 647 |
* back to 512, which could be a nasty surprise if you did not notice |
| 648 |
* the error message. |
| 649 |
* Instead what we do is clip it to the closest power of two lower |
| 650 |
* than the specified hash value. |
| 651 |
*/ |
| 652 |
if (!powerof2(hashsize)) { |
| 653 |
int oldhashsize = hashsize; |
| 654 |
|
| 655 |
hashsize = maketcp_hashsize(hashsize); |
| 656 |
/* prevent absurdly low value */ |
| 657 |
if (hashsize < 16) |
| 658 |
hashsize = 16; |
| 659 |
printf("%s: WARNING: TCB hash size not a power of 2, " |
| 660 |
"clipped from %d to %d.\n", __func__, oldhashsize, |
| 661 |
hashsize); |
| 662 |
} |
| 663 |
in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, |
| 664 |
"tcp_inpcb", tcp_inpcb_init, NULL, 0, IPI_HASHFIELDS_4TUPLE); |
| 665 |
|
| 666 |
/* |
| 667 |
* These have to be type stable for the benefit of the timers. |
| 668 |
*/ |
| 669 |
V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), |
| 670 |
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); |
| 671 |
uma_zone_set_max(V_tcpcb_zone, maxsockets); |
| 672 |
uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); |
| 673 |
|
| 674 |
tcp_tw_init(); |
| 675 |
syncache_init(); |
| 676 |
tcp_hc_init(); |
| 677 |
|
| 678 |
TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); |
| 679 |
V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), |
| 680 |
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); |
| 681 |
|
| 682 |
/* Skip initialization of globals for non-default instances. */ |
| 683 |
if (!IS_DEFAULT_VNET(curvnet)) |
| 684 |
return; |
| 685 |
|
| 686 |
tcp_reass_global_init(); |
| 687 |
|
| 688 |
/* XXX virtualize those bellow? */ |
| 689 |
tcp_delacktime = TCPTV_DELACK; |
| 690 |
tcp_keepinit = TCPTV_KEEP_INIT; |
| 691 |
tcp_keepidle = TCPTV_KEEP_IDLE; |
| 692 |
tcp_keepintvl = TCPTV_KEEPINTVL; |
| 693 |
tcp_maxpersistidle = TCPTV_KEEP_IDLE; |
| 694 |
tcp_msl = TCPTV_MSL; |
| 695 |
tcp_rexmit_min = TCPTV_MIN; |
| 696 |
if (tcp_rexmit_min < 1) |
| 697 |
tcp_rexmit_min = 1; |
| 698 |
tcp_persmin = TCPTV_PERSMIN; |
| 699 |
tcp_persmax = TCPTV_PERSMAX; |
| 700 |
tcp_rexmit_slop = TCPTV_CPU_VAR; |
| 701 |
tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; |
| 702 |
tcp_tcbhashsize = hashsize; |
| 703 |
/* Setup the tcp function block list */ |
| 704 |
init_tcp_functions(); |
| 705 |
register_tcp_functions(&tcp_def_funcblk, M_WAITOK); |
| 706 |
|
| 707 |
if (tcp_soreceive_stream) { |
| 708 |
#ifdef INET |
| 709 |
tcp_usrreqs.pru_soreceive = soreceive_stream; |
| 710 |
#endif |
| 711 |
#ifdef INET6 |
| 712 |
tcp6_usrreqs.pru_soreceive = soreceive_stream; |
| 713 |
#endif /* INET6 */ |
| 714 |
} |
| 715 |
|
| 716 |
#ifdef INET6 |
| 717 |
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) |
| 718 |
#else /* INET6 */ |
| 719 |
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) |
| 720 |
#endif /* INET6 */ |
| 721 |
if (max_protohdr < TCP_MINPROTOHDR) |
| 722 |
max_protohdr = TCP_MINPROTOHDR; |
| 723 |
if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) |
| 724 |
panic("tcp_init"); |
| 725 |
#undef TCP_MINPROTOHDR |
| 726 |
|
| 727 |
ISN_LOCK_INIT(); |
| 728 |
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, |
| 729 |
SHUTDOWN_PRI_DEFAULT); |
| 730 |
EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, |
| 731 |
EVENTHANDLER_PRI_ANY); |
| 732 |
#ifdef TCPPCAP |
| 733 |
tcp_pcap_init(); |
| 734 |
#endif |
| 735 |
|
| 736 |
#ifdef TCP_RFC7413 |
| 737 |
tcp_fastopen_init(); |
| 738 |
#endif |
| 739 |
} |
| 740 |
|
| 741 |
#ifdef VIMAGE |
| 742 |
static void |
| 743 |
tcp_destroy(void *unused __unused) |
| 744 |
{ |
| 745 |
int n; |
| 746 |
#ifdef TCP_HHOOK |
| 747 |
int error; |
| 748 |
#endif |
| 749 |
|
| 750 |
/* |
| 751 |
* All our processes are gone, all our sockets should be cleaned |
| 752 |
* up, which means, we should be past the tcp_discardcb() calls. |
| 753 |
* Sleep to let all tcpcb timers really disappear and cleanup. |
| 754 |
*/ |
| 755 |
for (;;) { |
| 756 |
INP_LIST_RLOCK(&V_tcbinfo); |
| 757 |
n = V_tcbinfo.ipi_count; |
| 758 |
INP_LIST_RUNLOCK(&V_tcbinfo); |
| 759 |
if (n == 0) |
| 760 |
break; |
| 761 |
pause("tcpdes", hz / 10); |
| 762 |
} |
| 763 |
tcp_hc_destroy(); |
| 764 |
syncache_destroy(); |
| 765 |
tcp_tw_destroy(); |
| 766 |
in_pcbinfo_destroy(&V_tcbinfo); |
| 767 |
/* tcp_discardcb() clears the sack_holes up. */ |
| 768 |
uma_zdestroy(V_sack_hole_zone); |
| 769 |
uma_zdestroy(V_tcpcb_zone); |
| 770 |
|
| 771 |
#ifdef TCP_RFC7413 |
| 772 |
/* |
| 773 |
* Cannot free the zone until all tcpcbs are released as we attach |
| 774 |
* the allocations to them. |
| 775 |
*/ |
| 776 |
tcp_fastopen_destroy(); |
| 777 |
#endif |
| 778 |
|
| 779 |
#ifdef TCP_HHOOK |
| 780 |
error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); |
| 781 |
if (error != 0) { |
| 782 |
printf("%s: WARNING: unable to deregister helper hook " |
| 783 |
"type=%d, id=%d: error %d returned\n", __func__, |
| 784 |
HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); |
| 785 |
} |
| 786 |
error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); |
| 787 |
if (error != 0) { |
| 788 |
printf("%s: WARNING: unable to deregister helper hook " |
| 789 |
"type=%d, id=%d: error %d returned\n", __func__, |
| 790 |
HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); |
| 791 |
} |
| 792 |
#endif |
| 793 |
} |
| 794 |
VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); |
| 795 |
#endif |
| 796 |
|
| 797 |
/*
 * Handler registered by tcp_init() for the shutdown_pre_sync event.
 * It currently has nothing to do; xtp is ignored.
 */
void
tcp_fini(void *xtp)
{

	/* Intentionally empty. */
}
| 802 |
|
| 803 |
/* |
| 804 |
* Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. |
| 805 |
* tcp_template used to store this data in mbufs, but we now recopy it out |
| 806 |
* of the tcpcb each time to conserve mbufs. |
| 807 |
*/ |
| 808 |
void |
| 809 |
tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) |
| 810 |
{ |
| 811 |
struct tcphdr *th = (struct tcphdr *)tcp_ptr; |
| 812 |
|
| 813 |
INP_WLOCK_ASSERT(inp); |
| 814 |
|
| 815 |
#ifdef INET6 |
| 816 |
if ((inp->inp_vflag & INP_IPV6) != 0) { |
| 817 |
struct ip6_hdr *ip6; |
| 818 |
|
| 819 |
ip6 = (struct ip6_hdr *)ip_ptr; |
| 820 |
ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | |
| 821 |
(inp->inp_flow & IPV6_FLOWINFO_MASK); |
| 822 |
ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | |
| 823 |
(IPV6_VERSION & IPV6_VERSION_MASK); |
| 824 |
ip6->ip6_nxt = IPPROTO_TCP; |
| 825 |
ip6->ip6_plen = htons(sizeof(struct tcphdr)); |
| 826 |
ip6->ip6_src = inp->in6p_laddr; |
| 827 |
ip6->ip6_dst = inp->in6p_faddr; |
| 828 |
} |
| 829 |
#endif /* INET6 */ |
| 830 |
#if defined(INET6) && defined(INET) |
| 831 |
else |
| 832 |
#endif |
| 833 |
#ifdef INET |
| 834 |
{ |
| 835 |
struct ip *ip; |
| 836 |
|
| 837 |
ip = (struct ip *)ip_ptr; |
| 838 |
ip->ip_v = IPVERSION; |
| 839 |
ip->ip_hl = 5; |
| 840 |
ip->ip_tos = inp->inp_ip_tos; |
| 841 |
ip->ip_len = 0; |
| 842 |
ip->ip_id = 0; |
| 843 |
ip->ip_off = 0; |
| 844 |
ip->ip_ttl = inp->inp_ip_ttl; |
| 845 |
ip->ip_sum = 0; |
| 846 |
ip->ip_p = IPPROTO_TCP; |
| 847 |
ip->ip_src = inp->inp_laddr; |
| 848 |
ip->ip_dst = inp->inp_faddr; |
| 849 |
} |
| 850 |
#endif /* INET */ |
| 851 |
th->th_sport = inp->inp_lport; |
| 852 |
th->th_dport = inp->inp_fport; |
| 853 |
th->th_seq = 0; |
| 854 |
th->th_ack = 0; |
| 855 |
th->th_x2 = 0; |
| 856 |
th->th_off = 5; |
| 857 |
th->th_flags = 0; |
| 858 |
th->th_win = 0; |
| 859 |
th->th_urp = 0; |
| 860 |
th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ |
| 861 |
} |
| 862 |
|
| 863 |
/* |
| 864 |
* Create template to be used to send tcp packets on a connection. |
| 865 |
* Allocates an mbuf and fills in a skeletal tcp/ip header. The only |
| 866 |
* use for this function is in keepalives, which use tcp_respond. |
| 867 |
*/ |
| 868 |
struct tcptemp * |
| 869 |
tcpip_maketemplate(struct inpcb *inp) |
| 870 |
{ |
| 871 |
struct tcptemp *t; |
| 872 |
|
| 873 |
t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); |
| 874 |
if (t == NULL) |
| 875 |
return (NULL); |
| 876 |
tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); |
| 877 |
return (t); |
| 878 |
} |
| 879 |
|
| 880 |
/* |
| 881 |
* Send a single message to the TCP at address specified by |
| 882 |
* the given TCP/IP header. If m == NULL, then we make a copy |
| 883 |
* of the tcpiphdr at th and send directly to the addressed host. |
| 884 |
* This is used to force keep alive messages out using the TCP |
| 885 |
* template for a connection. If flags are given then we send |
| 886 |
* a message back to the TCP which originated the segment th, |
| 887 |
* and discard the mbuf containing it and any other attached mbufs. |
| 888 |
* |
| 889 |
* In any case the ack and sequence number of the transmitted |
| 890 |
* segment are as specified by the parameters. |
| 891 |
* |
| 892 |
* NOTE: If m != NULL, then th must point to *inside* the mbuf. |
| 893 |
*/ |
| 894 |
void |
| 895 |
tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, |
| 896 |
tcp_seq ack, tcp_seq seq, int flags) |
| 897 |
{ |
| 898 |
struct tcpopt to; |
| 899 |
struct inpcb *inp; |
| 900 |
struct ip *ip; |
| 901 |
struct mbuf *optm; |
| 902 |
struct tcphdr *nth; |
| 903 |
u_char *optp; |
| 904 |
#ifdef INET6 |
| 905 |
struct ip6_hdr *ip6; |
| 906 |
int isipv6; |
| 907 |
#endif /* INET6 */ |
| 908 |
int optlen, tlen, win; |
| 909 |
bool incl_opts; |
| 910 |
|
| 911 |
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); |
| 912 |
|
| 913 |
#ifdef INET6 |
| 914 |
isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); |
| 915 |
ip6 = ipgen; |
| 916 |
#endif /* INET6 */ |
| 917 |
ip = ipgen; |
| 918 |
|
| 919 |
if (tp != NULL) { |
| 920 |
inp = tp->t_inpcb; |
| 921 |
KASSERT(inp != NULL, ("tcp control block w/o inpcb")); |
| 922 |
INP_WLOCK_ASSERT(inp); |
| 923 |
} else |
| 924 |
inp = NULL; |
| 925 |
|
| 926 |
incl_opts = false; |
| 927 |
win = 0; |
| 928 |
if (tp != NULL) { |
| 929 |
if (!(flags & TH_RST)) { |
| 930 |
win = sbspace(&inp->inp_socket->so_rcv); |
| 931 |
if (win > TCP_MAXWIN << tp->rcv_scale) |
| 932 |
win = TCP_MAXWIN << tp->rcv_scale; |
| 933 |
} |
| 934 |
if ((tp->t_flags & TF_NOOPT) == 0) |
| 935 |
incl_opts = true; |
| 936 |
} |
| 937 |
if (m == NULL) { |
| 938 |
m = m_gethdr(M_NOWAIT, MT_DATA); |
| 939 |
if (m == NULL) |
| 940 |
return; |
| 941 |
m->m_data += max_linkhdr; |
| 942 |
#ifdef INET6 |
| 943 |
if (isipv6) { |
| 944 |
bcopy((caddr_t)ip6, mtod(m, caddr_t), |
| 945 |
sizeof(struct ip6_hdr)); |
| 946 |
ip6 = mtod(m, struct ip6_hdr *); |
| 947 |
nth = (struct tcphdr *)(ip6 + 1); |
| 948 |
} else |
| 949 |
#endif /* INET6 */ |
| 950 |
{ |
| 951 |
bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); |
| 952 |
ip = mtod(m, struct ip *); |
| 953 |
nth = (struct tcphdr *)(ip + 1); |
| 954 |
} |
| 955 |
bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); |
| 956 |
flags = TH_ACK; |
| 957 |
} else if (!M_WRITABLE(m)) { |
| 958 |
struct mbuf *n; |
| 959 |
|
| 960 |
/* Can't reuse 'm', allocate a new mbuf. */ |
| 961 |
n = m_gethdr(M_NOWAIT, MT_DATA); |
| 962 |
if (n == NULL) { |
| 963 |
m_freem(m); |
| 964 |
return; |
| 965 |
} |
| 966 |
|
| 967 |
if (!m_dup_pkthdr(n, m, M_NOWAIT)) { |
| 968 |
m_freem(m); |
| 969 |
m_freem(n); |
| 970 |
return; |
| 971 |
} |
| 972 |
|
| 973 |
n->m_data += max_linkhdr; |
| 974 |
/* m_len is set later */ |
| 975 |
#define xchg(a,b,type) { type t; t=a; a=b; b=t; } |
| 976 |
#ifdef INET6 |
| 977 |
if (isipv6) { |
| 978 |
bcopy((caddr_t)ip6, mtod(n, caddr_t), |
| 979 |
sizeof(struct ip6_hdr)); |
| 980 |
ip6 = mtod(n, struct ip6_hdr *); |
| 981 |
xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); |
| 982 |
nth = (struct tcphdr *)(ip6 + 1); |
| 983 |
} else |
| 984 |
#endif /* INET6 */ |
| 985 |
{ |
| 986 |
bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip)); |
| 987 |
ip = mtod(n, struct ip *); |
| 988 |
xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); |
| 989 |
nth = (struct tcphdr *)(ip + 1); |
| 990 |
} |
| 991 |
bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); |
| 992 |
xchg(nth->th_dport, nth->th_sport, uint16_t); |
| 993 |
th = nth; |
| 994 |
m_freem(m); |
| 995 |
m = n; |
| 996 |
} else { |
| 997 |
/* |
| 998 |
* reuse the mbuf. |
| 999 |
* XXX MRT We inherit the FIB, which is lucky. |
| 1000 |
*/ |
| 1001 |
m_freem(m->m_next); |
| 1002 |
m->m_next = NULL; |
| 1003 |
m->m_data = (caddr_t)ipgen; |
| 1004 |
/* m_len is set later */ |
| 1005 |
#ifdef INET6 |
| 1006 |
if (isipv6) { |
| 1007 |
xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); |
| 1008 |
nth = (struct tcphdr *)(ip6 + 1); |
| 1009 |
} else |
| 1010 |
#endif /* INET6 */ |
| 1011 |
{ |
| 1012 |
xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); |
| 1013 |
nth = (struct tcphdr *)(ip + 1); |
| 1014 |
} |
| 1015 |
if (th != nth) { |
| 1016 |
/* |
| 1017 |
* this is usually a case when an extension header |
| 1018 |
* exists between the IPv6 header and the |
| 1019 |
* TCP header. |
| 1020 |
*/ |
| 1021 |
nth->th_sport = th->th_sport; |
| 1022 |
nth->th_dport = th->th_dport; |
| 1023 |
} |
| 1024 |
xchg(nth->th_dport, nth->th_sport, uint16_t); |
| 1025 |
#undef xchg |
| 1026 |
} |
| 1027 |
tlen = 0; |
| 1028 |
#ifdef INET6 |
| 1029 |
if (isipv6) |
| 1030 |
tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); |
| 1031 |
#endif |
| 1032 |
#if defined(INET) && defined(INET6) |
| 1033 |
else |
| 1034 |
#endif |
| 1035 |
#ifdef INET |
| 1036 |
tlen = sizeof (struct tcpiphdr); |
| 1037 |
#endif |
| 1038 |
#ifdef INVARIANTS |
| 1039 |
m->m_len = 0; |
| 1040 |
KASSERT(M_TRAILINGSPACE(m) >= tlen, |
| 1041 |
("Not enough trailing space for message (m=%p, need=%d, have=%ld)", |
| 1042 |
m, tlen, (long)M_TRAILINGSPACE(m))); |
| 1043 |
#endif |
| 1044 |
m->m_len = tlen; |
| 1045 |
to.to_flags = 0; |
| 1046 |
if (incl_opts) { |
| 1047 |
/* Make sure we have room. */ |
| 1048 |
if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { |
| 1049 |
m->m_next = m_get(M_NOWAIT, MT_DATA); |
| 1050 |
if (m->m_next) { |
| 1051 |
optp = mtod(m->m_next, u_char *); |
| 1052 |
optm = m->m_next; |
| 1053 |
} else |
| 1054 |
incl_opts = false; |
| 1055 |
} else { |
| 1056 |
optp = (u_char *) (nth + 1); |
| 1057 |
optm = m; |
| 1058 |
} |
| 1059 |
} |
| 1060 |
if (incl_opts) { |
| 1061 |
/* Timestamps. */ |
| 1062 |
if (tp->t_flags & TF_RCVD_TSTMP) { |
| 1063 |
to.to_tsval = tcp_ts_getticks() + tp->ts_offset; |
| 1064 |
to.to_tsecr = tp->ts_recent; |
| 1065 |
to.to_flags |= TOF_TS; |
| 1066 |
} |
| 1067 |
#ifdef TCP_SIGNATURE |
| 1068 |
/* TCP-MD5 (RFC2385). */ |
| 1069 |
if (tp->t_flags & TF_SIGNATURE) |
| 1070 |
to.to_flags |= TOF_SIGNATURE; |
| 1071 |
#endif |
| 1072 |
|
| 1073 |
/* Add the options. */ |
| 1074 |
tlen += optlen = tcp_addoptions(&to, optp); |
| 1075 |
|
| 1076 |
/* Update m_len in the correct mbuf. */ |
| 1077 |
optm->m_len += optlen; |
| 1078 |
} else |
| 1079 |
optlen = 0; |
| 1080 |
#ifdef INET6 |
| 1081 |
if (isipv6) { |
| 1082 |
ip6->ip6_flow = 0; |
| 1083 |
ip6->ip6_vfc = IPV6_VERSION; |
| 1084 |
ip6->ip6_nxt = IPPROTO_TCP; |
| 1085 |
ip6->ip6_plen = htons(tlen - sizeof(*ip6)); |
| 1086 |
} |
| 1087 |
#endif |
| 1088 |
#if defined(INET) && defined(INET6) |
| 1089 |
else |
| 1090 |
#endif |
| 1091 |
#ifdef INET |
| 1092 |
{ |
| 1093 |
ip->ip_len = htons(tlen); |
| 1094 |
ip->ip_ttl = V_ip_defttl; |
| 1095 |
if (V_path_mtu_discovery) |
| 1096 |
ip->ip_off |= htons(IP_DF); |
| 1097 |
} |
| 1098 |
#endif |
| 1099 |
m->m_pkthdr.len = tlen; |
| 1100 |
m->m_pkthdr.rcvif = NULL; |
| 1101 |
#ifdef MAC |
| 1102 |
if (inp != NULL) { |
| 1103 |
/* |
| 1104 |
* Packet is associated with a socket, so allow the |
| 1105 |
* label of the response to reflect the socket label. |
| 1106 |
*/ |
| 1107 |
INP_WLOCK_ASSERT(inp); |
| 1108 |
mac_inpcb_create_mbuf(inp, m); |
| 1109 |
} else { |
| 1110 |
/* |
| 1111 |
* Packet is not associated with a socket, so possibly |
| 1112 |
* update the label in place. |
| 1113 |
*/ |
| 1114 |
mac_netinet_tcp_reply(m); |
| 1115 |
} |
| 1116 |
#endif |
| 1117 |
nth->th_seq = htonl(seq); |
| 1118 |
nth->th_ack = htonl(ack); |
| 1119 |
nth->th_x2 = 0; |
| 1120 |
nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; |
| 1121 |
nth->th_flags = flags; |
| 1122 |
if (tp != NULL) |
| 1123 |
nth->th_win = htons((u_short) (win >> tp->rcv_scale)); |
| 1124 |
else |
| 1125 |
nth->th_win = htons((u_short)win); |
| 1126 |
nth->th_urp = 0; |
| 1127 |
|
| 1128 |
#ifdef TCP_SIGNATURE |
| 1129 |
if (to.to_flags & TOF_SIGNATURE) { |
| 1130 |
tcp_signature_compute(m, 0, 0, optlen, to.to_signature, |
| 1131 |
IPSEC_DIR_OUTBOUND); |
| 1132 |
} |
| 1133 |
#endif |
| 1134 |
|
| 1135 |
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); |
| 1136 |
#ifdef INET6 |
| 1137 |
if (isipv6) { |
| 1138 |
m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; |
| 1139 |
nth->th_sum = in6_cksum_pseudo(ip6, |
| 1140 |
tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); |
| 1141 |
ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : |
| 1142 |
NULL, NULL); |
| 1143 |
} |
| 1144 |
#endif /* INET6 */ |
| 1145 |
#if defined(INET6) && defined(INET) |
| 1146 |
else |
| 1147 |
#endif |
| 1148 |
#ifdef INET |
| 1149 |
{ |
| 1150 |
m->m_pkthdr.csum_flags = CSUM_TCP; |
| 1151 |
nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, |
| 1152 |
htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); |
| 1153 |
} |
| 1154 |
#endif /* INET */ |
| 1155 |
#ifdef TCPDEBUG |
| 1156 |
if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) |
| 1157 |
tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); |
| 1158 |
#endif |
| 1159 |
TCP_PROBE3(debug__output, tp, th, mtod(m, const char *)); |
| 1160 |
if (flags & TH_RST) |
| 1161 |
TCP_PROBE5(accept__refused, NULL, NULL, mtod(m, const char *), |
| 1162 |
tp, nth); |
| 1163 |
|
| 1164 |
TCP_PROBE5(send, NULL, tp, mtod(m, const char *), tp, nth); |
| 1165 |
#ifdef INET6 |
| 1166 |
if (isipv6) |
| 1167 |
(void) ip6_output(m, NULL, NULL, 0, NULL, NULL, inp); |
| 1168 |
#endif /* INET6 */ |
| 1169 |
#if defined(INET) && defined(INET6) |
| 1170 |
else |
| 1171 |
#endif |
| 1172 |
#ifdef INET |
| 1173 |
(void) ip_output(m, NULL, NULL, 0, NULL, inp); |
| 1174 |
#endif |
| 1175 |
} |
| 1176 |
|
| 1177 |
/* |
| 1178 |
* Create a new TCP control block, making an |
| 1179 |
* empty reassembly queue and hooking it to the argument |
| 1180 |
* protocol control block. The `inp' parameter must have |
| 1181 |
* come from the zone allocator set up in tcp_init(). |
| 1182 |
*/ |
| 1183 |
struct tcpcb * |
| 1184 |
tcp_newtcpcb(struct inpcb *inp) |
| 1185 |
{ |
| 1186 |
struct tcpcb_mem *tm; |
| 1187 |
struct tcpcb *tp; |
| 1188 |
#ifdef INET6 |
| 1189 |
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; |
| 1190 |
#endif /* INET6 */ |
| 1191 |
|
| 1192 |
tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); |
| 1193 |
if (tm == NULL) |
| 1194 |
return (NULL); |
| 1195 |
tp = &tm->tcb; |
| 1196 |
|
| 1197 |
/* Initialise cc_var struct for this tcpcb. */ |
| 1198 |
tp->ccv = &tm->ccv; |
| 1199 |
tp->ccv->type = IPPROTO_TCP; |
| 1200 |
tp->ccv->ccvc.tcp = tp; |
| 1201 |
rw_rlock(&tcp_function_lock); |
| 1202 |
tp->t_fb = tcp_func_set_ptr; |
| 1203 |
refcount_acquire(&tp->t_fb->tfb_refcnt); |
| 1204 |
rw_runlock(&tcp_function_lock); |
| 1205 |
/* |
| 1206 |
* Use the current system default CC algorithm. |
| 1207 |
*/ |
| 1208 |
CC_LIST_RLOCK(); |
| 1209 |
KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); |
| 1210 |
CC_ALGO(tp) = CC_DEFAULT(); |
| 1211 |
CC_LIST_RUNLOCK(); |
| 1212 |
|
| 1213 |
if (CC_ALGO(tp)->cb_init != NULL) |
| 1214 |
if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { |
| 1215 |
if (tp->t_fb->tfb_tcp_fb_fini) |
| 1216 |
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); |
| 1217 |
refcount_release(&tp->t_fb->tfb_refcnt); |
| 1218 |
uma_zfree(V_tcpcb_zone, tm); |
| 1219 |
return (NULL); |
| 1220 |
} |
| 1221 |
|
| 1222 |
#ifdef TCP_HHOOK |
| 1223 |
tp->osd = &tm->osd; |
| 1224 |
if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { |
| 1225 |
if (tp->t_fb->tfb_tcp_fb_fini) |
| 1226 |
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); |
| 1227 |
refcount_release(&tp->t_fb->tfb_refcnt); |
| 1228 |
uma_zfree(V_tcpcb_zone, tm); |
| 1229 |
return (NULL); |
| 1230 |
} |
| 1231 |
#endif |
| 1232 |
|
| 1233 |
#ifdef VIMAGE |
| 1234 |
tp->t_vnet = inp->inp_vnet; |
| 1235 |
#endif |
| 1236 |
tp->t_timers = &tm->tt; |
| 1237 |
/* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ |
| 1238 |
tp->t_maxseg = |
| 1239 |
#ifdef INET6 |
| 1240 |
isipv6 ? V_tcp_v6mssdflt : |
| 1241 |
#endif /* INET6 */ |
| 1242 |
V_tcp_mssdflt; |
| 1243 |
|
| 1244 |
/* Set up our timeouts. */ |
| 1245 |
callout_init(&tp->t_timers->tt_rexmt, 1); |
| 1246 |
callout_init(&tp->t_timers->tt_persist, 1); |
| 1247 |
callout_init(&tp->t_timers->tt_keep, 1); |
| 1248 |
callout_init(&tp->t_timers->tt_2msl, 1); |
| 1249 |
callout_init(&tp->t_timers->tt_delack, 1); |
| 1250 |
|
| 1251 |
if (V_tcp_do_rfc1323) |
| 1252 |
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); |
| 1253 |
if (V_tcp_do_sack) |
| 1254 |
tp->t_flags |= TF_SACK_PERMIT; |
| 1255 |
TAILQ_INIT(&tp->snd_holes); |
| 1256 |
/* |
| 1257 |
* The tcpcb will hold a reference on its inpcb until tcp_discardcb() |
| 1258 |
* is called. |
| 1259 |
*/ |
| 1260 |
in_pcbref(inp); /* Reference for tcpcb */ |
| 1261 |
tp->t_inpcb = inp; |
| 1262 |
|
| 1263 |
/* |
| 1264 |
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no |
| 1265 |
* rtt estimate. Set rttvar so that srtt + 4 * rttvar gives |
| 1266 |
* reasonable initial retransmit time. |
| 1267 |
*/ |
| 1268 |
tp->t_srtt = TCPTV_SRTTBASE; |
| 1269 |
tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; |
| 1270 |
tp->t_rttmin = tcp_rexmit_min; |
| 1271 |
tp->t_rxtcur = TCPTV_RTOBASE; |
| 1272 |
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; |
| 1273 |
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; |
| 1274 |
tp->t_rcvtime = ticks; |
| 1275 |
/* |
| 1276 |
* IPv4 TTL initialization is necessary for an IPv6 socket as well, |
| 1277 |
* because the socket may be bound to an IPv6 wildcard address, |
| 1278 |
* which may match an IPv4-mapped IPv6 address. |
| 1279 |
*/ |
| 1280 |
inp->inp_ip_ttl = V_ip_defttl; |
| 1281 |
inp->inp_ppcb = tp; |
| 1282 |
#ifdef TCPPCAP |
| 1283 |
/* |
| 1284 |
* Init the TCP PCAP queues. |
| 1285 |
*/ |
| 1286 |
tcp_pcap_tcpcb_init(tp); |
| 1287 |
#endif |
| 1288 |
if (tp->t_fb->tfb_tcp_fb_init) { |
| 1289 |
(*tp->t_fb->tfb_tcp_fb_init)(tp); |
| 1290 |
} |
| 1291 |
return (tp); /* XXX */ |
| 1292 |
} |
| 1293 |
|
| 1294 |
/* |
| 1295 |
* Switch the congestion control algorithm back to NewReno for any active |
| 1296 |
* control blocks using an algorithm which is about to go away. |
| 1297 |
* This ensures the CC framework can allow the unload to proceed without leaving |
| 1298 |
* any dangling pointers which would trigger a panic. |
| 1299 |
* Returning non-zero would inform the CC framework that something went wrong |
| 1300 |
* and it would be unsafe to allow the unload to proceed. However, there is no |
| 1301 |
* way for this to occur with this implementation so we always return zero. |
| 1302 |
*/ |
| 1303 |
int |
| 1304 |
tcp_ccalgounload(struct cc_algo *unload_algo) |
| 1305 |
{ |
| 1306 |
struct cc_algo *tmpalgo; |
| 1307 |
struct inpcb *inp; |
| 1308 |
struct tcpcb *tp; |
| 1309 |
VNET_ITERATOR_DECL(vnet_iter); |
| 1310 |
|
| 1311 |
/* |
| 1312 |
* Check all active control blocks across all network stacks and change |
| 1313 |
* any that are using "unload_algo" back to NewReno. If "unload_algo" |
| 1314 |
* requires cleanup code to be run, call it. |
| 1315 |
*/ |
| 1316 |
VNET_LIST_RLOCK(); |
| 1317 |
VNET_FOREACH(vnet_iter) { |
| 1318 |
CURVNET_SET(vnet_iter); |
| 1319 |
INP_INFO_WLOCK(&V_tcbinfo); |
| 1320 |
/* |
| 1321 |
* New connections already part way through being initialised |
| 1322 |
* with the CC algo we're removing will not race with this code |
| 1323 |
* because the INP_INFO_WLOCK is held during initialisation. We |
| 1324 |
* therefore don't enter the loop below until the connection |
| 1325 |
* list has stabilised. |
| 1326 |
*/ |
| 1327 |
LIST_FOREACH(inp, &V_tcb, inp_list) { |
| 1328 |
INP_WLOCK(inp); |
| 1329 |
/* Important to skip tcptw structs. */ |
| 1330 |
if (!(inp->inp_flags & INP_TIMEWAIT) && |
| 1331 |
(tp = intotcpcb(inp)) != NULL) { |
| 1332 |
/* |
| 1333 |
* By holding INP_WLOCK here, we are assured |
| 1334 |
* that the connection is not currently |
| 1335 |
* executing inside the CC module's functions |
| 1336 |
* i.e. it is safe to make the switch back to |
| 1337 |
* NewReno. |
| 1338 |
*/ |
| 1339 |
if (CC_ALGO(tp) == unload_algo) { |
| 1340 |
tmpalgo = CC_ALGO(tp); |
| 1341 |
/* NewReno does not require any init. */ |
| 1342 |
CC_ALGO(tp) = &newreno_cc_algo; |
| 1343 |
if (tmpalgo->cb_destroy != NULL) |
| 1344 |
tmpalgo->cb_destroy(tp->ccv); |
| 1345 |
} |
| 1346 |
} |
| 1347 |
INP_WUNLOCK(inp); |
| 1348 |
} |
| 1349 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 1350 |
CURVNET_RESTORE(); |
| 1351 |
} |
| 1352 |
VNET_LIST_RUNLOCK(); |
| 1353 |
|
| 1354 |
return (0); |
| 1355 |
} |
| 1356 |
|
| 1357 |
/* |
| 1358 |
* Drop a TCP connection, reporting |
| 1359 |
* the specified error. If connection is synchronized, |
| 1360 |
* then send a RST to peer. |
| 1361 |
*/ |
| 1362 |
struct tcpcb * |
| 1363 |
tcp_drop(struct tcpcb *tp, int errno) |
| 1364 |
{ |
| 1365 |
struct socket *so = tp->t_inpcb->inp_socket; |
| 1366 |
|
| 1367 |
INP_INFO_LOCK_ASSERT(&V_tcbinfo); |
| 1368 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 1369 |
|
| 1370 |
if (TCPS_HAVERCVDSYN(tp->t_state)) { |
| 1371 |
tcp_state_change(tp, TCPS_CLOSED); |
| 1372 |
(void) tp->t_fb->tfb_tcp_output(tp); |
| 1373 |
TCPSTAT_INC(tcps_drops); |
| 1374 |
} else |
| 1375 |
TCPSTAT_INC(tcps_conndrops); |
| 1376 |
if (errno == ETIMEDOUT && tp->t_softerror) |
| 1377 |
errno = tp->t_softerror; |
| 1378 |
so->so_error = errno; |
| 1379 |
return (tcp_close(tp)); |
| 1380 |
} |
| 1381 |
|
| 1382 |
void |
| 1383 |
tcp_discardcb(struct tcpcb *tp) |
| 1384 |
{ |
| 1385 |
struct inpcb *inp = tp->t_inpcb; |
| 1386 |
struct socket *so = inp->inp_socket; |
| 1387 |
#ifdef INET6 |
| 1388 |
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; |
| 1389 |
#endif /* INET6 */ |
| 1390 |
int released; |
| 1391 |
|
| 1392 |
INP_WLOCK_ASSERT(inp); |
| 1393 |
|
| 1394 |
/* |
| 1395 |
* Make sure that all of our timers are stopped before we delete the |
| 1396 |
* PCB. |
| 1397 |
* |
| 1398 |
* If stopping a timer fails, we schedule a discard function in same |
| 1399 |
* callout, and the last discard function called will take care of |
| 1400 |
* deleting the tcpcb. |
| 1401 |
*/ |
| 1402 |
tp->t_timers->tt_draincnt = 0; |
| 1403 |
tcp_timer_stop(tp, TT_REXMT); |
| 1404 |
tcp_timer_stop(tp, TT_PERSIST); |
| 1405 |
tcp_timer_stop(tp, TT_KEEP); |
| 1406 |
tcp_timer_stop(tp, TT_2MSL); |
| 1407 |
tcp_timer_stop(tp, TT_DELACK); |
| 1408 |
if (tp->t_fb->tfb_tcp_timer_stop_all) { |
| 1409 |
/* |
| 1410 |
* Call the stop-all function of the methods, |
| 1411 |
* this function should call the tcp_timer_stop() |
| 1412 |
* method with each of the function specific timeouts. |
| 1413 |
* That stop will be called via the tfb_tcp_timer_stop() |
| 1414 |
* which should use the async drain function of the |
| 1415 |
* callout system (see tcp_var.h). |
| 1416 |
*/ |
| 1417 |
tp->t_fb->tfb_tcp_timer_stop_all(tp); |
| 1418 |
} |
| 1419 |
|
| 1420 |
/* |
| 1421 |
* If we got enough samples through the srtt filter, |
| 1422 |
* save the rtt and rttvar in the routing entry. |
| 1423 |
* 'Enough' is arbitrarily defined as 4 rtt samples. |
| 1424 |
* 4 samples is enough for the srtt filter to converge |
| 1425 |
* to within enough % of the correct value; fewer samples |
| 1426 |
* and we could save a bogus rtt. The danger is not high |
| 1427 |
* as tcp quickly recovers from everything. |
| 1428 |
* XXX: Works very well but needs some more statistics! |
| 1429 |
*/ |
| 1430 |
if (tp->t_rttupdated >= 4) { |
| 1431 |
struct hc_metrics_lite metrics; |
| 1432 |
uint32_t ssthresh; |
| 1433 |
|
| 1434 |
bzero(&metrics, sizeof(metrics)); |
| 1435 |
/* |
| 1436 |
* Update the ssthresh always when the conditions below |
| 1437 |
* are satisfied. This gives us better new start value |
| 1438 |
* for the congestion avoidance for new connections. |
| 1439 |
* ssthresh is only set if packet loss occurred on a session. |
| 1440 |
* |
| 1441 |
* XXXRW: 'so' may be NULL here, and/or socket buffer may be |
| 1442 |
* being torn down. Ideally this code would not use 'so'. |
| 1443 |
*/ |
| 1444 |
ssthresh = tp->snd_ssthresh; |
| 1445 |
if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { |
| 1446 |
/* |
| 1447 |
* convert the limit from user data bytes to |
| 1448 |
* packets then to packet data bytes. |
| 1449 |
*/ |
| 1450 |
ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; |
| 1451 |
if (ssthresh < 2) |
| 1452 |
ssthresh = 2; |
| 1453 |
ssthresh *= (tp->t_maxseg + |
| 1454 |
#ifdef INET6 |
| 1455 |
(isipv6 ? sizeof (struct ip6_hdr) + |
| 1456 |
sizeof (struct tcphdr) : |
| 1457 |
#endif |
| 1458 |
sizeof (struct tcpiphdr) |
| 1459 |
#ifdef INET6 |
| 1460 |
) |
| 1461 |
#endif |
| 1462 |
); |
| 1463 |
} else |
| 1464 |
ssthresh = 0; |
| 1465 |
metrics.rmx_ssthresh = ssthresh; |
| 1466 |
|
| 1467 |
metrics.rmx_rtt = tp->t_srtt; |
| 1468 |
metrics.rmx_rttvar = tp->t_rttvar; |
| 1469 |
metrics.rmx_cwnd = tp->snd_cwnd; |
| 1470 |
metrics.rmx_sendpipe = 0; |
| 1471 |
metrics.rmx_recvpipe = 0; |
| 1472 |
|
| 1473 |
tcp_hc_update(&inp->inp_inc, &metrics); |
| 1474 |
} |
| 1475 |
|
| 1476 |
/* free the reassembly queue, if any */ |
| 1477 |
tcp_reass_flush(tp); |
| 1478 |
|
| 1479 |
#ifdef TCP_OFFLOAD |
| 1480 |
/* Disconnect offload device, if any. */ |
| 1481 |
if (tp->t_flags & TF_TOE) |
| 1482 |
tcp_offload_detach(tp); |
| 1483 |
#endif |
| 1484 |
|
| 1485 |
tcp_free_sackholes(tp); |
| 1486 |
|
| 1487 |
#ifdef TCPPCAP |
| 1488 |
/* Free the TCP PCAP queues. */ |
| 1489 |
tcp_pcap_drain(&(tp->t_inpkts)); |
| 1490 |
tcp_pcap_drain(&(tp->t_outpkts)); |
| 1491 |
#endif |
| 1492 |
|
| 1493 |
/* Allow the CC algorithm to clean up after itself. */ |
| 1494 |
if (CC_ALGO(tp)->cb_destroy != NULL) |
| 1495 |
CC_ALGO(tp)->cb_destroy(tp->ccv); |
| 1496 |
|
| 1497 |
#ifdef TCP_HHOOK |
| 1498 |
khelp_destroy_osd(tp->osd); |
| 1499 |
#endif |
| 1500 |
|
| 1501 |
CC_ALGO(tp) = NULL; |
| 1502 |
inp->inp_ppcb = NULL; |
| 1503 |
if (tp->t_timers->tt_draincnt == 0) { |
| 1504 |
/* We own the last reference on tcpcb, let's free it. */ |
| 1505 |
if (tp->t_fb->tfb_tcp_fb_fini) |
| 1506 |
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); |
| 1507 |
refcount_release(&tp->t_fb->tfb_refcnt); |
| 1508 |
tp->t_inpcb = NULL; |
| 1509 |
uma_zfree(V_tcpcb_zone, tp); |
| 1510 |
released = in_pcbrele_wlocked(inp); |
| 1511 |
KASSERT(!released, ("%s: inp %p should not have been released " |
| 1512 |
"here", __func__, inp)); |
| 1513 |
} |
| 1514 |
} |
| 1515 |
|
| 1516 |
void |
| 1517 |
tcp_timer_discard(void *ptp) |
| 1518 |
{ |
| 1519 |
struct inpcb *inp; |
| 1520 |
struct tcpcb *tp; |
| 1521 |
|
| 1522 |
tp = (struct tcpcb *)ptp; |
| 1523 |
CURVNET_SET(tp->t_vnet); |
| 1524 |
INP_INFO_RLOCK(&V_tcbinfo); |
| 1525 |
inp = tp->t_inpcb; |
| 1526 |
KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", |
| 1527 |
__func__, tp)); |
| 1528 |
INP_WLOCK(inp); |
| 1529 |
KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0, |
| 1530 |
("%s: tcpcb has to be stopped here", __func__)); |
| 1531 |
tp->t_timers->tt_draincnt--; |
| 1532 |
if (tp->t_timers->tt_draincnt == 0) { |
| 1533 |
/* We own the last reference on this tcpcb, let's free it. */ |
| 1534 |
if (tp->t_fb->tfb_tcp_fb_fini) |
| 1535 |
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); |
| 1536 |
refcount_release(&tp->t_fb->tfb_refcnt); |
| 1537 |
tp->t_inpcb = NULL; |
| 1538 |
uma_zfree(V_tcpcb_zone, tp); |
| 1539 |
if (in_pcbrele_wlocked(inp)) { |
| 1540 |
INP_INFO_RUNLOCK(&V_tcbinfo); |
| 1541 |
CURVNET_RESTORE(); |
| 1542 |
return; |
| 1543 |
} |
| 1544 |
} |
| 1545 |
INP_WUNLOCK(inp); |
| 1546 |
INP_INFO_RUNLOCK(&V_tcbinfo); |
| 1547 |
CURVNET_RESTORE(); |
| 1548 |
} |
| 1549 |
|
| 1550 |
/* |
| 1551 |
* Attempt to close a TCP control block, marking it as dropped, and freeing |
| 1552 |
* the socket if we hold the only reference. |
| 1553 |
*/ |
| 1554 |
struct tcpcb * |
| 1555 |
tcp_close(struct tcpcb *tp) |
| 1556 |
{ |
| 1557 |
struct inpcb *inp = tp->t_inpcb; |
| 1558 |
struct socket *so; |
| 1559 |
|
| 1560 |
INP_INFO_LOCK_ASSERT(&V_tcbinfo); |
| 1561 |
INP_WLOCK_ASSERT(inp); |
| 1562 |
|
| 1563 |
#ifdef TCP_OFFLOAD |
| 1564 |
if (tp->t_state == TCPS_LISTEN) |
| 1565 |
tcp_offload_listen_stop(tp); |
| 1566 |
#endif |
| 1567 |
#ifdef TCP_RFC7413 |
| 1568 |
/* |
| 1569 |
* This releases the TFO pending counter resource for TFO listen |
| 1570 |
* sockets as well as passively-created TFO sockets that transition |
| 1571 |
* from SYN_RECEIVED to CLOSED. |
| 1572 |
*/ |
| 1573 |
if (tp->t_tfo_pending) { |
| 1574 |
tcp_fastopen_decrement_counter(tp->t_tfo_pending); |
| 1575 |
tp->t_tfo_pending = NULL; |
| 1576 |
} |
| 1577 |
#endif |
| 1578 |
in_pcbdrop(inp); |
| 1579 |
TCPSTAT_INC(tcps_closed); |
| 1580 |
TCPSTATES_DEC(tp->t_state); |
| 1581 |
KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); |
| 1582 |
so = inp->inp_socket; |
| 1583 |
soisdisconnected(so); |
| 1584 |
if (inp->inp_flags & INP_SOCKREF) { |
| 1585 |
KASSERT(so->so_state & SS_PROTOREF, |
| 1586 |
("tcp_close: !SS_PROTOREF")); |
| 1587 |
inp->inp_flags &= ~INP_SOCKREF; |
| 1588 |
INP_WUNLOCK(inp); |
| 1589 |
ACCEPT_LOCK(); |
| 1590 |
SOCK_LOCK(so); |
| 1591 |
so->so_state &= ~SS_PROTOREF; |
| 1592 |
sofree(so); |
| 1593 |
return (NULL); |
| 1594 |
} |
| 1595 |
return (tp); |
| 1596 |
} |
| 1597 |
|
| 1598 |
void |
| 1599 |
tcp_drain(void) |
| 1600 |
{ |
| 1601 |
VNET_ITERATOR_DECL(vnet_iter); |
| 1602 |
|
| 1603 |
if (!do_tcpdrain) |
| 1604 |
return; |
| 1605 |
|
| 1606 |
VNET_LIST_RLOCK_NOSLEEP(); |
| 1607 |
VNET_FOREACH(vnet_iter) { |
| 1608 |
CURVNET_SET(vnet_iter); |
| 1609 |
struct inpcb *inpb; |
| 1610 |
struct tcpcb *tcpb; |
| 1611 |
|
| 1612 |
/* |
| 1613 |
* Walk the tcpbs, if existing, and flush the reassembly queue, |
| 1614 |
* if there is one... |
| 1615 |
* XXX: The "Net/3" implementation doesn't imply that the TCP |
| 1616 |
* reassembly queue should be flushed, but in a situation |
| 1617 |
* where we're really low on mbufs, this is potentially |
| 1618 |
* useful. |
| 1619 |
*/ |
| 1620 |
INP_INFO_WLOCK(&V_tcbinfo); |
| 1621 |
LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { |
| 1622 |
if (inpb->inp_flags & INP_TIMEWAIT) |
| 1623 |
continue; |
| 1624 |
INP_WLOCK(inpb); |
| 1625 |
if ((tcpb = intotcpcb(inpb)) != NULL) { |
| 1626 |
tcp_reass_flush(tcpb); |
| 1627 |
tcp_clean_sackreport(tcpb); |
| 1628 |
#ifdef TCPPCAP |
| 1629 |
if (tcp_pcap_aggressive_free) { |
| 1630 |
/* Free the TCP PCAP queues. */ |
| 1631 |
tcp_pcap_drain(&(tcpb->t_inpkts)); |
| 1632 |
tcp_pcap_drain(&(tcpb->t_outpkts)); |
| 1633 |
} |
| 1634 |
#endif |
| 1635 |
} |
| 1636 |
INP_WUNLOCK(inpb); |
| 1637 |
} |
| 1638 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 1639 |
CURVNET_RESTORE(); |
| 1640 |
} |
| 1641 |
VNET_LIST_RUNLOCK_NOSLEEP(); |
| 1642 |
} |
| 1643 |
|
| 1644 |
/* |
| 1645 |
* Notify a tcp user of an asynchronous error; |
| 1646 |
* store error as soft error, but wake up user |
| 1647 |
* (for now, won't do anything until can select for soft error). |
| 1648 |
* |
| 1649 |
* Do not wake up user since there currently is no mechanism for |
| 1650 |
* reporting soft errors (yet - a kqueue filter may be added). |
| 1651 |
*/ |
| 1652 |
static struct inpcb * |
| 1653 |
tcp_notify(struct inpcb *inp, int error) |
| 1654 |
{ |
| 1655 |
struct tcpcb *tp; |
| 1656 |
|
| 1657 |
INP_INFO_LOCK_ASSERT(&V_tcbinfo); |
| 1658 |
INP_WLOCK_ASSERT(inp); |
| 1659 |
|
| 1660 |
if ((inp->inp_flags & INP_TIMEWAIT) || |
| 1661 |
(inp->inp_flags & INP_DROPPED)) |
| 1662 |
return (inp); |
| 1663 |
|
| 1664 |
tp = intotcpcb(inp); |
| 1665 |
KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); |
| 1666 |
|
| 1667 |
/* |
| 1668 |
* Ignore some errors if we are hooked up. |
| 1669 |
* If connection hasn't completed, has retransmitted several times, |
| 1670 |
* and receives a second error, give up now. This is better |
| 1671 |
* than waiting a long time to establish a connection that |
| 1672 |
* can never complete. |
| 1673 |
*/ |
| 1674 |
if (tp->t_state == TCPS_ESTABLISHED && |
| 1675 |
(error == EHOSTUNREACH || error == ENETUNREACH || |
| 1676 |
error == EHOSTDOWN)) { |
| 1677 |
if (inp->inp_route.ro_rt) { |
| 1678 |
RTFREE(inp->inp_route.ro_rt); |
| 1679 |
inp->inp_route.ro_rt = (struct rtentry *)NULL; |
| 1680 |
} |
| 1681 |
return (inp); |
| 1682 |
} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && |
| 1683 |
tp->t_softerror) { |
| 1684 |
tp = tcp_drop(tp, error); |
| 1685 |
if (tp != NULL) |
| 1686 |
return (inp); |
| 1687 |
else |
| 1688 |
return (NULL); |
| 1689 |
} else { |
| 1690 |
tp->t_softerror = error; |
| 1691 |
return (inp); |
| 1692 |
} |
| 1693 |
#if 0 |
| 1694 |
wakeup( &so->so_timeo); |
| 1695 |
sorwakeup(so); |
| 1696 |
sowwakeup(so); |
| 1697 |
#endif |
| 1698 |
} |
| 1699 |
|
| 1700 |
static int |
| 1701 |
tcp_pcblist(SYSCTL_HANDLER_ARGS) |
| 1702 |
{ |
| 1703 |
int error, i, m, n, pcb_count; |
| 1704 |
struct inpcb *inp, **inp_list; |
| 1705 |
inp_gen_t gencnt; |
| 1706 |
struct xinpgen xig; |
| 1707 |
|
| 1708 |
/* |
| 1709 |
* The process of preparing the TCB list is too time-consuming and |
| 1710 |
* resource-intensive to repeat twice on every request. |
| 1711 |
*/ |
| 1712 |
if (req->oldptr == NULL) { |
| 1713 |
n = V_tcbinfo.ipi_count + |
| 1714 |
counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); |
| 1715 |
n += imax(n / 8, 10); |
| 1716 |
req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); |
| 1717 |
return (0); |
| 1718 |
} |
| 1719 |
|
| 1720 |
if (req->newptr != NULL) |
| 1721 |
return (EPERM); |
| 1722 |
|
| 1723 |
/* |
| 1724 |
* OK, now we're committed to doing something. |
| 1725 |
*/ |
| 1726 |
INP_LIST_RLOCK(&V_tcbinfo); |
| 1727 |
gencnt = V_tcbinfo.ipi_gencnt; |
| 1728 |
n = V_tcbinfo.ipi_count; |
| 1729 |
INP_LIST_RUNLOCK(&V_tcbinfo); |
| 1730 |
|
| 1731 |
m = counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); |
| 1732 |
|
| 1733 |
error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) |
| 1734 |
+ (n + m) * sizeof(struct xtcpcb)); |
| 1735 |
if (error != 0) |
| 1736 |
return (error); |
| 1737 |
|
| 1738 |
xig.xig_len = sizeof xig; |
| 1739 |
xig.xig_count = n + m; |
| 1740 |
xig.xig_gen = gencnt; |
| 1741 |
xig.xig_sogen = so_gencnt; |
| 1742 |
error = SYSCTL_OUT(req, &xig, sizeof xig); |
| 1743 |
if (error) |
| 1744 |
return (error); |
| 1745 |
|
| 1746 |
error = syncache_pcblist(req, m, &pcb_count); |
| 1747 |
if (error) |
| 1748 |
return (error); |
| 1749 |
|
| 1750 |
inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); |
| 1751 |
|
| 1752 |
INP_INFO_WLOCK(&V_tcbinfo); |
| 1753 |
for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; |
| 1754 |
inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { |
| 1755 |
INP_WLOCK(inp); |
| 1756 |
if (inp->inp_gencnt <= gencnt) { |
| 1757 |
/* |
| 1758 |
* XXX: This use of cr_cansee(), introduced with |
| 1759 |
* TCP state changes, is not quite right, but for |
| 1760 |
* now, better than nothing. |
| 1761 |
*/ |
| 1762 |
if (inp->inp_flags & INP_TIMEWAIT) { |
| 1763 |
if (intotw(inp) != NULL) |
| 1764 |
error = cr_cansee(req->td->td_ucred, |
| 1765 |
intotw(inp)->tw_cred); |
| 1766 |
else |
| 1767 |
error = EINVAL; /* Skip this inp. */ |
| 1768 |
} else |
| 1769 |
error = cr_canseeinpcb(req->td->td_ucred, inp); |
| 1770 |
if (error == 0) { |
| 1771 |
in_pcbref(inp); |
| 1772 |
inp_list[i++] = inp; |
| 1773 |
} |
| 1774 |
} |
| 1775 |
INP_WUNLOCK(inp); |
| 1776 |
} |
| 1777 |
INP_INFO_WUNLOCK(&V_tcbinfo); |
| 1778 |
n = i; |
| 1779 |
|
| 1780 |
error = 0; |
| 1781 |
for (i = 0; i < n; i++) { |
| 1782 |
inp = inp_list[i]; |
| 1783 |
INP_RLOCK(inp); |
| 1784 |
if (inp->inp_gencnt <= gencnt) { |
| 1785 |
struct xtcpcb xt; |
| 1786 |
void *inp_ppcb; |
| 1787 |
|
| 1788 |
bzero(&xt, sizeof(xt)); |
| 1789 |
xt.xt_len = sizeof xt; |
| 1790 |
/* XXX should avoid extra copy */ |
| 1791 |
bcopy(inp, &xt.xt_inp, sizeof *inp); |
| 1792 |
inp_ppcb = inp->inp_ppcb; |
| 1793 |
if (inp_ppcb == NULL) |
| 1794 |
bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); |
| 1795 |
else if (inp->inp_flags & INP_TIMEWAIT) { |
| 1796 |
bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); |
| 1797 |
xt.xt_tp.t_state = TCPS_TIME_WAIT; |
| 1798 |
} else { |
| 1799 |
bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); |
| 1800 |
if (xt.xt_tp.t_timers) |
| 1801 |
tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer); |
| 1802 |
} |
| 1803 |
if (inp->inp_socket != NULL) |
| 1804 |
sotoxsocket(inp->inp_socket, &xt.xt_socket); |
| 1805 |
else { |
| 1806 |
bzero(&xt.xt_socket, sizeof xt.xt_socket); |
| 1807 |
xt.xt_socket.xso_protocol = IPPROTO_TCP; |
| 1808 |
} |
| 1809 |
xt.xt_inp.inp_gencnt = inp->inp_gencnt; |
| 1810 |
INP_RUNLOCK(inp); |
| 1811 |
error = SYSCTL_OUT(req, &xt, sizeof xt); |
| 1812 |
} else |
| 1813 |
INP_RUNLOCK(inp); |
| 1814 |
} |
| 1815 |
INP_INFO_RLOCK(&V_tcbinfo); |
| 1816 |
for (i = 0; i < n; i++) { |
| 1817 |
inp = inp_list[i]; |
| 1818 |
INP_RLOCK(inp); |
| 1819 |
if (!in_pcbrele_rlocked(inp)) |
| 1820 |
INP_RUNLOCK(inp); |
| 1821 |
} |
| 1822 |
INP_INFO_RUNLOCK(&V_tcbinfo); |
| 1823 |
|
| 1824 |
if (!error) { |
| 1825 |
/* |
| 1826 |
* Give the user an updated idea of our state. |
| 1827 |
* If the generation differs from what we told |
| 1828 |
* her before, she knows that something happened |
| 1829 |
* while we were processing this request, and it |
| 1830 |
* might be necessary to retry. |
| 1831 |
*/ |
| 1832 |
INP_LIST_RLOCK(&V_tcbinfo); |
| 1833 |
xig.xig_gen = V_tcbinfo.ipi_gencnt; |
| 1834 |
xig.xig_sogen = so_gencnt; |
| 1835 |
xig.xig_count = V_tcbinfo.ipi_count + pcb_count; |
| 1836 |
INP_LIST_RUNLOCK(&V_tcbinfo); |
| 1837 |
error = SYSCTL_OUT(req, &xig, sizeof xig); |
| 1838 |
} |
| 1839 |
free(inp_list, M_TEMP); |
| 1840 |
return (error); |
| 1841 |
} |
| 1842 |
|
| 1843 |
SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, |
| 1844 |
CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, |
| 1845 |
tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); |
| 1846 |
|
| 1847 |
#ifdef INET |
| 1848 |
static int |
| 1849 |
tcp_getcred(SYSCTL_HANDLER_ARGS) |
| 1850 |
{ |
| 1851 |
struct xucred xuc; |
| 1852 |
struct sockaddr_in addrs[2]; |
| 1853 |
struct inpcb *inp; |
| 1854 |
int error; |
| 1855 |
|
| 1856 |
error = priv_check(req->td, PRIV_NETINET_GETCRED); |
| 1857 |
if (error) |
| 1858 |
return (error); |
| 1859 |
error = SYSCTL_IN(req, addrs, sizeof(addrs)); |
| 1860 |
if (error) |
| 1861 |
return (error); |
| 1862 |
inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, |
| 1863 |
addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); |
| 1864 |
if (inp != NULL) { |
| 1865 |
if (inp->inp_socket == NULL) |
| 1866 |
error = ENOENT; |
| 1867 |
if (error == 0) |
| 1868 |
error = cr_canseeinpcb(req->td->td_ucred, inp); |
| 1869 |
if (error == 0) |
| 1870 |
cru2x(inp->inp_cred, &xuc); |
| 1871 |
INP_RUNLOCK(inp); |
| 1872 |
} else |
| 1873 |
error = ENOENT; |
| 1874 |
if (error == 0) |
| 1875 |
error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); |
| 1876 |
return (error); |
| 1877 |
} |
| 1878 |
|
| 1879 |
/*
 * Read/write opaque sysctl node served by tcp_getcred(): the request data
 * is written in and the resulting struct xucred is read back.
 */
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
	CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON,
	0, 0, tcp_getcred,
	"S,xucred", "Get the xucred of a TCP connection");
| 1882 |
#endif /* INET */ |
| 1883 |
|
| 1884 |
#ifdef INET6 |
| 1885 |
static int |
| 1886 |
tcp6_getcred(SYSCTL_HANDLER_ARGS) |
| 1887 |
{ |
| 1888 |
struct xucred xuc; |
| 1889 |
struct sockaddr_in6 addrs[2]; |
| 1890 |
struct inpcb *inp; |
| 1891 |
int error; |
| 1892 |
#ifdef INET |
| 1893 |
int mapped = 0; |
| 1894 |
#endif |
| 1895 |
|
| 1896 |
error = priv_check(req->td, PRIV_NETINET_GETCRED); |
| 1897 |
if (error) |
| 1898 |
return (error); |
| 1899 |
error = SYSCTL_IN(req, addrs, sizeof(addrs)); |
| 1900 |
if (error) |
| 1901 |
return (error); |
| 1902 |
if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || |
| 1903 |
(error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { |
| 1904 |
return (error); |
| 1905 |
} |
| 1906 |
if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { |
| 1907 |
#ifdef INET |
| 1908 |
if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) |
| 1909 |
mapped = 1; |
| 1910 |
else |
| 1911 |
#endif |
| 1912 |
return (EINVAL); |
| 1913 |
} |
| 1914 |
|
| 1915 |
#ifdef INET |
| 1916 |
if (mapped == 1) |
| 1917 |
inp = in_pcblookup(&V_tcbinfo, |
| 1918 |
*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], |
| 1919 |
addrs[1].sin6_port, |
| 1920 |
*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], |
| 1921 |
addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); |
| 1922 |
else |
| 1923 |
#endif |
| 1924 |
inp = in6_pcblookup(&V_tcbinfo, |
| 1925 |
&addrs[1].sin6_addr, addrs[1].sin6_port, |
| 1926 |
&addrs[0].sin6_addr, addrs[0].sin6_port, |
| 1927 |
INPLOOKUP_RLOCKPCB, NULL); |
| 1928 |
if (inp != NULL) { |
| 1929 |
if (inp->inp_socket == NULL) |
| 1930 |
error = ENOENT; |
| 1931 |
if (error == 0) |
| 1932 |
error = cr_canseeinpcb(req->td->td_ucred, inp); |
| 1933 |
if (error == 0) |
| 1934 |
cru2x(inp->inp_cred, &xuc); |
| 1935 |
INP_RUNLOCK(inp); |
| 1936 |
} else |
| 1937 |
error = ENOENT; |
| 1938 |
if (error == 0) |
| 1939 |
error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); |
| 1940 |
return (error); |
| 1941 |
} |
| 1942 |
|
| 1943 |
/*
 * Read/write opaque sysctl node served by tcp6_getcred(): the request data
 * is written in and the resulting struct xucred is read back.
 */
SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
	CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON,
	0, 0, tcp6_getcred,
	"S,xucred", "Get the xucred of a TCP6 connection");
| 1946 |
#endif /* INET6 */ |
| 1947 |
|
| 1948 |
|
| 1949 |
#ifdef INET |
| 1950 |
void |
| 1951 |
tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) |
| 1952 |
{ |
| 1953 |
struct ip *ip = vip; |
| 1954 |
struct tcphdr *th; |
| 1955 |
struct in_addr faddr; |
| 1956 |
struct inpcb *inp; |
| 1957 |
struct tcpcb *tp; |
| 1958 |
struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; |
| 1959 |
struct icmp *icp; |
| 1960 |
struct in_conninfo inc; |
| 1961 |
tcp_seq icmp_tcp_seq; |
| 1962 |
int mtu; |
| 1963 |
|
| 1964 |
faddr = ((struct sockaddr_in *)sa)->sin_addr; |
| 1965 |
if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) |
| 1966 |
return; |
| 1967 |
|
| 1968 |
if (cmd == PRC_MSGSIZE) |
| 1969 |
notify = tcp_mtudisc_notify; |
| 1970 |
else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || |
| 1971 |
cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || |
| 1972 |
cmd == PRC_TIMXCEED_INTRANS) && ip) |
| 1973 |
notify = tcp_drop_syn_sent; |
| 1974 |
|
| 1975 |
/* |
| 1976 |
* Hostdead is ugly because it goes linearly through all PCBs. |
| 1977 |
* XXX: We never get this from ICMP, otherwise it makes an |
| 1978 |
* excellent DoS attack on machines with many connections. |
| 1979 |
*/ |
| 1980 |
else if (cmd == PRC_HOSTDEAD) |
| 1981 |
ip = NULL; |
| 1982 |
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) |
| 1983 |
return; |
| 1984 |
|
| 1985 |
if (ip == NULL) { |
| 1986 |
in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); |
| 1987 |
return; |
| 1988 |
} |
| 1989 |
|
| 1990 |
icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); |
| 1991 |
th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); |
| 1992 |
INP_INFO_RLOCK(&V_tcbinfo); |
| 1993 |
inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, |
| 1994 |
th->th_sport, INPLOOKUP_WLOCKPCB, NULL); |
| 1995 |
if (inp != NULL && PRC_IS_REDIRECT(cmd)) { |
| 1996 |
/* signal EHOSTDOWN, as it flushes the cached route */ |
| 1997 |
inp = (*notify)(inp, EHOSTDOWN); |
| 1998 |
if (inp != NULL) |
| 1999 |
INP_WUNLOCK(inp); |
| 2000 |
} else if (inp != NULL) { |
| 2001 |
if (!(inp->inp_flags & INP_TIMEWAIT) && |
| 2002 |
!(inp->inp_flags & INP_DROPPED) && |
| 2003 |
!(inp->inp_socket == NULL)) { |
| 2004 |
icmp_tcp_seq = ntohl(th->th_seq); |
| 2005 |
tp = intotcpcb(inp); |
| 2006 |
if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && |
| 2007 |
SEQ_LT(icmp_tcp_seq, tp->snd_max)) { |
| 2008 |
if (cmd == PRC_MSGSIZE) { |
| 2009 |
/* |
| 2010 |
* MTU discovery: |
| 2011 |
* If we got a needfrag set the MTU |
| 2012 |
* in the route to the suggested new |
| 2013 |
* value (if given) and then notify. |
| 2014 |
*/ |
| 2015 |
mtu = ntohs(icp->icmp_nextmtu); |
| 2016 |
/* |
| 2017 |
* If no alternative MTU was |
| 2018 |
* proposed, try the next smaller |
| 2019 |
* one. |
| 2020 |
*/ |
| 2021 |
if (!mtu) |
| 2022 |
mtu = ip_next_mtu( |
| 2023 |
ntohs(ip->ip_len), 1); |
| 2024 |
if (mtu < V_tcp_minmss + |
| 2025 |
sizeof(struct tcpiphdr)) |
| 2026 |
mtu = V_tcp_minmss + |
| 2027 |
sizeof(struct tcpiphdr); |
| 2028 |
/* |
| 2029 |
* Only process the offered MTU if it |
| 2030 |
* is smaller than the current one. |
| 2031 |
*/ |
| 2032 |
if (mtu < tp->t_maxseg + |
| 2033 |
sizeof(struct tcpiphdr)) { |
| 2034 |
bzero(&inc, sizeof(inc)); |
| 2035 |
inc.inc_faddr = faddr; |
| 2036 |
inc.inc_fibnum = |
| 2037 |
inp->inp_inc.inc_fibnum; |
| 2038 |
tcp_hc_updatemtu(&inc, mtu); |
| 2039 |
tcp_mtudisc(inp, mtu); |
| 2040 |
} |
| 2041 |
} else |
| 2042 |
inp = (*notify)(inp, |
| 2043 |
inetctlerrmap[cmd]); |
| 2044 |
} |
| 2045 |
} |
| 2046 |
if (inp != NULL) |
| 2047 |
INP_WUNLOCK(inp); |
| 2048 |
} else { |
| 2049 |
bzero(&inc, sizeof(inc)); |
| 2050 |
inc.inc_fport = th->th_dport; |
| 2051 |
inc.inc_lport = th->th_sport; |
| 2052 |
inc.inc_faddr = faddr; |
| 2053 |
inc.inc_laddr = ip->ip_src; |
| 2054 |
syncache_unreach(&inc, th); |
| 2055 |
} |
| 2056 |
INP_INFO_RUNLOCK(&V_tcbinfo); |
| 2057 |
} |
| 2058 |
#endif /* INET */ |
| 2059 |
|
| 2060 |
#ifdef INET6 |
| 2061 |
void |
| 2062 |
tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) |
| 2063 |
{ |
| 2064 |
struct in6_addr *dst; |
| 2065 |
struct tcphdr *th; |
| 2066 |
struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; |
| 2067 |
struct ip6_hdr *ip6; |
| 2068 |
struct mbuf *m; |
| 2069 |
struct inpcb *inp; |
| 2070 |
struct tcpcb *tp; |
| 2071 |
struct icmp6_hdr *icmp6; |
| 2072 |
struct ip6ctlparam *ip6cp = NULL; |
| 2073 |
const struct sockaddr_in6 *sa6_src = NULL; |
| 2074 |
struct in_conninfo inc; |
| 2075 |
tcp_seq icmp_tcp_seq; |
| 2076 |
unsigned int mtu; |
| 2077 |
unsigned int off; |
| 2078 |
|
| 2079 |
|
| 2080 |
if (sa->sa_family != AF_INET6 || |
| 2081 |
sa->sa_len != sizeof(struct sockaddr_in6)) |
| 2082 |
return; |
| 2083 |
|
| 2084 |
/* if the parameter is from icmp6, decode it. */ |
| 2085 |
if (d != NULL) { |
| 2086 |
ip6cp = (struct ip6ctlparam *)d; |
| 2087 |
icmp6 = ip6cp->ip6c_icmp6; |
| 2088 |
m = ip6cp->ip6c_m; |
| 2089 |
ip6 = ip6cp->ip6c_ip6; |
| 2090 |
off = ip6cp->ip6c_off; |
| 2091 |
sa6_src = ip6cp->ip6c_src; |
| 2092 |
dst = ip6cp->ip6c_finaldst; |
| 2093 |
} else { |
| 2094 |
m = NULL; |
| 2095 |
ip6 = NULL; |
| 2096 |
off = 0; /* fool gcc */ |
| 2097 |
sa6_src = &sa6_any; |
| 2098 |
dst = NULL; |
| 2099 |
} |
| 2100 |
|
| 2101 |
if (cmd == PRC_MSGSIZE) |
| 2102 |
notify = tcp_mtudisc_notify; |
| 2103 |
else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || |
| 2104 |
cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || |
| 2105 |
cmd == PRC_TIMXCEED_INTRANS) && ip6 != NULL) |
| 2106 |
notify = tcp_drop_syn_sent; |
| 2107 |
|
| 2108 |
/* |
| 2109 |
* Hostdead is ugly because it goes linearly through all PCBs. |
| 2110 |
* XXX: We never get this from ICMP, otherwise it makes an |
| 2111 |
* excellent DoS attack on machines with many connections. |
| 2112 |
*/ |
| 2113 |
else if (cmd == PRC_HOSTDEAD) |
| 2114 |
ip6 = NULL; |
| 2115 |
else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0) |
| 2116 |
return; |
| 2117 |
|
| 2118 |
if (ip6 == NULL) { |
| 2119 |
in6_pcbnotify(&V_tcbinfo, sa, 0, |
| 2120 |
(const struct sockaddr *)sa6_src, |
| 2121 |
0, cmd, NULL, notify); |
| 2122 |
return; |
| 2123 |
} |
| 2124 |
|
| 2125 |
/* Check if we can safely get the ports from the tcp hdr */ |
| 2126 |
if (m == NULL || |
| 2127 |
(m->m_pkthdr.len < |
| 2128 |
(int32_t) (off + offsetof(struct tcphdr, th_seq)))) { |
| 2129 |
return; |
| 2130 |
} |
| 2131 |
|
| 2132 |
th = (struct tcphdr *) mtodo(ip6cp->ip6c_m, ip6cp->ip6c_off); |
| 2133 |
INP_INFO_RLOCK(&V_tcbinfo); |
| 2134 |
inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, th->th_dport, |
| 2135 |
&ip6->ip6_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); |
| 2136 |
if (inp != NULL && PRC_IS_REDIRECT(cmd)) { |
| 2137 |
/* signal EHOSTDOWN, as it flushes the cached route */ |
| 2138 |
inp = (*notify)(inp, EHOSTDOWN); |
| 2139 |
if (inp != NULL) |
| 2140 |
INP_WUNLOCK(inp); |
| 2141 |
} else if (inp != NULL) { |
| 2142 |
if (!(inp->inp_flags & INP_TIMEWAIT) && |
| 2143 |
!(inp->inp_flags & INP_DROPPED) && |
| 2144 |
!(inp->inp_socket == NULL)) { |
| 2145 |
icmp_tcp_seq = ntohl(th->th_seq); |
| 2146 |
tp = intotcpcb(inp); |
| 2147 |
if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && |
| 2148 |
SEQ_LT(icmp_tcp_seq, tp->snd_max)) { |
| 2149 |
if (cmd == PRC_MSGSIZE) { |
| 2150 |
/* |
| 2151 |
* MTU discovery: |
| 2152 |
* If we got a needfrag set the MTU |
| 2153 |
* in the route to the suggested new |
| 2154 |
* value (if given) and then notify. |
| 2155 |
*/ |
| 2156 |
mtu = ntohl(icmp6->icmp6_mtu); |
| 2157 |
/* |
| 2158 |
* If no alternative MTU was |
| 2159 |
* proposed, or the proposed |
| 2160 |
* MTU was too small, set to |
| 2161 |
* the min. |
| 2162 |
*/ |
| 2163 |
if (mtu < IPV6_MMTU) |
| 2164 |
mtu = IPV6_MMTU - 8; |
| 2165 |
|
| 2166 |
|
| 2167 |
bzero(&inc, sizeof(inc)); |
| 2168 |
inc.inc_fibnum = M_GETFIB(m); |
| 2169 |
inc.inc_flags |= INC_ISIPV6; |
| 2170 |
inc.inc6_faddr = *dst; |
| 2171 |
if (in6_setscope(&inc.inc6_faddr, |
| 2172 |
m->m_pkthdr.rcvif, NULL)) |
| 2173 |
goto unlock_inp; |
| 2174 |
|
| 2175 |
/* |
| 2176 |
* Only process the offered MTU if it |
| 2177 |
* is smaller than the current one. |
| 2178 |
*/ |
| 2179 |
if (mtu < tp->t_maxseg + |
| 2180 |
(sizeof (*th) + sizeof (*ip6))) { |
| 2181 |
tcp_hc_updatemtu(&inc, mtu); |
| 2182 |
tcp_mtudisc(inp, mtu); |
| 2183 |
ICMP6STAT_INC(icp6s_pmtuchg); |
| 2184 |
} |
| 2185 |
} else |
| 2186 |
inp = (*notify)(inp, |
| 2187 |
inet6ctlerrmap[cmd]); |
| 2188 |
} |
| 2189 |
} |
| 2190 |
unlock_inp: |
| 2191 |
if (inp != NULL) |
| 2192 |
INP_WUNLOCK(inp); |
| 2193 |
} else { |
| 2194 |
bzero(&inc, sizeof(inc)); |
| 2195 |
inc.inc_fibnum = M_GETFIB(m); |
| 2196 |
inc.inc_flags |= INC_ISIPV6; |
| 2197 |
inc.inc_fport = th->th_dport; |
| 2198 |
inc.inc_lport = th->th_sport; |
| 2199 |
inc.inc6_faddr = *dst; |
| 2200 |
inc.inc6_laddr = ip6->ip6_src; |
| 2201 |
syncache_unreach(&inc, th); |
| 2202 |
} |
| 2203 |
INP_INFO_RUNLOCK(&V_tcbinfo); |
| 2204 |
} |
| 2205 |
#endif /* INET6 */ |
| 2206 |
|
| 2207 |
|
| 2208 |
/* |
| 2209 |
* Following is where TCP initial sequence number generation occurs. |
| 2210 |
* |
| 2211 |
* There are two places where we must use initial sequence numbers: |
| 2212 |
* 1. In SYN-ACK packets. |
| 2213 |
* 2. In SYN packets. |
| 2214 |
* |
| 2215 |
* All ISNs for SYN-ACK packets are generated by the syncache. See |
| 2216 |
* tcp_syncache.c for details. |
| 2217 |
* |
| 2218 |
* The ISNs in SYN packets must be monotonic; TIME_WAIT recycling |
| 2219 |
* depends on this property. In addition, these ISNs should be |
| 2220 |
* unguessable so as to prevent connection hijacking. To satisfy |
| 2221 |
* the requirements of this situation, the algorithm outlined in |
| 2222 |
* RFC 1948 is used, with only small modifications. |
| 2223 |
* |
| 2224 |
* Implementation details: |
| 2225 |
* |
| 2226 |
* Time is based off the system timer, and is corrected so that it |
| 2227 |
* increases by one megabyte per second. This allows for proper |
| 2228 |
* recycling on high speed LANs while still leaving over an hour |
| 2229 |
* before rollover. |
| 2230 |
* |
| 2231 |
* As reading the *exact* system time is too expensive to be done |
| 2232 |
* whenever setting up a TCP connection, we increment the time |
| 2233 |
* offset in two ways. First, a small random positive increment |
| 2234 |
* is added to isn_offset for each connection that is set up. |
| 2235 |
* Second, the function tcp_isn_tick fires once per clock tick |
| 2236 |
* and increments isn_offset as necessary so that sequence numbers |
| 2237 |
* are incremented at approximately ISN_BYTES_PER_SECOND. The |
| 2238 |
* random positive increments serve only to ensure that the same |
| 2239 |
* exact sequence number is never sent out twice (as could otherwise |
| 2240 |
* happen when a port is recycled in less than the system tick |
| 2241 |
* interval.) |
| 2242 |
* |
| 2243 |
* net.inet.tcp.isn_reseed_interval controls the number of seconds |
| 2244 |
* between seeding of isn_secret. This is normally set to zero, |
| 2245 |
* as reseeding should not be necessary. |
| 2246 |
* |
| 2247 |
 * Locking of the global variables isn_secret, isn_last, isn_last_reseed,
 * isn_offset and isn_offset_old is performed with ISN_LOCK()/ISN_UNLOCK(),
 * as done in tcp_new_isn(); the MD5 context is local to that function.
| 2250 |
*/ |
| 2251 |
|
| 2252 |
#define ISN_BYTES_PER_SECOND 1048576 |
| 2253 |
#define ISN_STATIC_INCREMENT 4096 |
| 2254 |
#define ISN_RANDOM_INCREMENT (4096 - 1) |
| 2255 |
|
| 2256 |
static VNET_DEFINE(u_char, isn_secret[32]); |
| 2257 |
static VNET_DEFINE(int, isn_last); |
| 2258 |
static VNET_DEFINE(int, isn_last_reseed); |
| 2259 |
static VNET_DEFINE(u_int32_t, isn_offset); |
| 2260 |
static VNET_DEFINE(u_int32_t, isn_offset_old); |
| 2261 |
|
| 2262 |
#define V_isn_secret VNET(isn_secret) |
| 2263 |
#define V_isn_last VNET(isn_last) |
| 2264 |
#define V_isn_last_reseed VNET(isn_last_reseed) |
| 2265 |
#define V_isn_offset VNET(isn_offset) |
| 2266 |
#define V_isn_offset_old VNET(isn_offset_old) |
| 2267 |
|
| 2268 |
tcp_seq |
| 2269 |
tcp_new_isn(struct tcpcb *tp) |
| 2270 |
{ |
| 2271 |
MD5_CTX isn_ctx; |
| 2272 |
u_int32_t md5_buffer[4]; |
| 2273 |
tcp_seq new_isn; |
| 2274 |
u_int32_t projected_offset; |
| 2275 |
|
| 2276 |
INP_WLOCK_ASSERT(tp->t_inpcb); |
| 2277 |
|
| 2278 |
ISN_LOCK(); |
| 2279 |
/* Seed if this is the first use, reseed if requested. */ |
| 2280 |
if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && |
| 2281 |
(((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) |
| 2282 |
< (u_int)ticks))) { |
| 2283 |
read_random(&V_isn_secret, sizeof(V_isn_secret)); |
| 2284 |
V_isn_last_reseed = ticks; |
| 2285 |
} |
| 2286 |
|
| 2287 |
/* Compute the md5 hash and return the ISN. */ |
| 2288 |
MD5Init(&isn_ctx); |
| 2289 |
MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); |
| 2290 |
MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); |
| 2291 |
#ifdef INET6 |
| 2292 |
if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { |
| 2293 |
MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, |
| 2294 |
sizeof(struct in6_addr)); |
| 2295 |
MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, |
| 2296 |
sizeof(struct in6_addr)); |
| 2297 |
} else |
| 2298 |
#endif |
| 2299 |
{ |
| 2300 |
MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, |
| 2301 |
sizeof(struct in_addr)); |
| 2302 |
MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, |
| 2303 |
sizeof(struct in_addr)); |
| 2304 |
} |
| 2305 |
MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); |
| 2306 |
MD5Final((u_char *) &md5_buffer, &isn_ctx); |
| 2307 |
new_isn = (tcp_seq) md5_buffer[0]; |
| 2308 |
V_isn_offset += ISN_STATIC_INCREMENT + |
| 2309 |
(arc4random() & ISN_RANDOM_INCREMENT); |
| 2310 |
if (ticks != V_isn_last) { |
| 2311 |
projected_offset = V_isn_offset_old + |
| 2312 |
ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); |
| 2313 |
if (SEQ_GT(projected_offset, V_isn_offset)) |
| 2314 |
V_isn_offset = projected_offset; |
| 2315 |
V_isn_offset_old = V_isn_offset; |
| 2316 |
V_isn_last = ticks; |
| 2317 |
} |
| 2318 |
new_isn += V_isn_offset; |
| 2319 |
ISN_UNLOCK(); |
| 2320 |
return (new_isn); |
| 2321 |
} |
| 2322 |
|
| 2323 |
/* |
| 2324 |
* When a specific ICMP unreachable message is received and the |
| 2325 |
* connection state is SYN-SENT, drop the connection. This behavior |
| 2326 |
* is controlled by the icmp_may_rst sysctl. |
| 2327 |
*/ |
| 2328 |
struct inpcb * |
| 2329 |
tcp_drop_syn_sent(struct inpcb *inp, int errno) |
| 2330 |
{ |
| 2331 |
struct tcpcb *tp; |
| 2332 |
|
| 2333 |
INP_INFO_RLOCK_ASSERT(&V_tcbinfo); |
| 2334 |
INP_WLOCK_ASSERT(inp); |
| 2335 |
|
| 2336 |
if ((inp->inp_flags & INP_TIMEWAIT) || |
| 2337 |
(inp->inp_flags & INP_DROPPED)) |
| 2338 |
return (inp); |
| 2339 |
|
| 2340 |
tp = intotcpcb(inp); |
| 2341 |
if (tp->t_state != TCPS_SYN_SENT) |
| 2342 |
return (inp); |
| 2343 |
|
| 2344 |
tp = tcp_drop(tp, errno); |
| 2345 |
if (tp != NULL) |
| 2346 |
return (inp); |
| 2347 |
else |
| 2348 |
return (NULL); |
| 2349 |
} |
| 2350 |
|
| 2351 |
/*
 * Notify-routine adapter around tcp_mtudisc().
 *
 * Used when a `need fragmentation' ICMP is received: tcp_mtudisc() is
 * invoked with an MTU offer of -1, and it updates our idea of the MSS and
 * nudges TCP to send something, since we know the packet we just sent was
 * dropped.  This duplicates some code in the tcp_mss() function in
 * tcp_input.c.
 *
 * The error argument is ignored; inp is always returned unchanged.
 */
static struct inpcb *
tcp_mtudisc_notify(struct inpcb *inp, int error)
{

	tcp_mtudisc(inp, -1);
	return (inp);
}
| 2364 |
|
| 2365 |
static void |
| 2366 |
tcp_mtudisc(struct inpcb *inp, int mtuoffer) |
| 2367 |
{ |
| 2368 |
struct tcpcb *tp; |
| 2369 |
struct socket *so; |
| 2370 |
|
| 2371 |
INP_WLOCK_ASSERT(inp); |
| 2372 |
if ((inp->inp_flags & INP_TIMEWAIT) || |
| 2373 |
(inp->inp_flags & INP_DROPPED)) |
| 2374 |
return; |
| 2375 |
|
| 2376 |
tp = intotcpcb(inp); |
| 2377 |
KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); |
| 2378 |
|
| 2379 |
tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); |
| 2380 |
|
| 2381 |
so = inp->inp_socket; |
| 2382 |
SOCKBUF_LOCK(&so->so_snd); |
| 2383 |
/* If the mss is larger than the socket buffer, decrease the mss. */ |
| 2384 |
if (so->so_snd.sb_hiwat < tp->t_maxseg) |
| 2385 |
tp->t_maxseg = so->so_snd.sb_hiwat; |
| 2386 |
SOCKBUF_UNLOCK(&so->so_snd); |
| 2387 |
|
| 2388 |
TCPSTAT_INC(tcps_mturesent); |
| 2389 |
tp->t_rtttime = 0; |
| 2390 |
tp->snd_nxt = tp->snd_una; |
| 2391 |
tcp_free_sackholes(tp); |
| 2392 |
tp->snd_recover = tp->snd_max; |
| 2393 |
if (tp->t_flags & TF_SACK_PERMIT) |
| 2394 |
EXIT_FASTRECOVERY(tp->t_flags); |
| 2395 |
tp->t_fb->tfb_tcp_output(tp); |
| 2396 |
} |
| 2397 |
|
| 2398 |
#ifdef INET |
| 2399 |
/* |
| 2400 |
* Look-up the routing entry to the peer of this inpcb. If no route |
| 2401 |
* is found and it cannot be allocated, then return 0. This routine |
| 2402 |
* is called by TCP routines that access the rmx structure and by |
| 2403 |
* tcp_mss_update to get the peer/interface MTU. |
| 2404 |
*/ |
| 2405 |
uint32_t |
| 2406 |
tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) |
| 2407 |
{ |
| 2408 |
struct nhop4_extended nh4; |
| 2409 |
struct ifnet *ifp; |
| 2410 |
uint32_t maxmtu = 0; |
| 2411 |
|
| 2412 |
KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); |
| 2413 |
|
| 2414 |
if (inc->inc_faddr.s_addr != INADDR_ANY) { |
| 2415 |
|
| 2416 |
if (fib4_lookup_nh_ext(inc->inc_fibnum, inc->inc_faddr, |
| 2417 |
NHR_REF, 0, &nh4) != 0) |
| 2418 |
return (0); |
| 2419 |
|
| 2420 |
ifp = nh4.nh_ifp; |
| 2421 |
maxmtu = nh4.nh_mtu; |
| 2422 |
|
| 2423 |
/* Report additional interface capabilities. */ |
| 2424 |
if (cap != NULL) { |
| 2425 |
if (ifp->if_capenable & IFCAP_TSO4 && |
| 2426 |
ifp->if_hwassist & CSUM_TSO) { |
| 2427 |
cap->ifcap |= CSUM_TSO; |
| 2428 |
cap->tsomax = ifp->if_hw_tsomax; |
| 2429 |
cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; |
| 2430 |
cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; |
| 2431 |
} |
| 2432 |
} |
| 2433 |
fib4_free_nh_ext(inc->inc_fibnum, &nh4); |
| 2434 |
} |
| 2435 |
return (maxmtu); |
| 2436 |
} |
| 2437 |
#endif /* INET */ |
| 2438 |
|
| 2439 |
#ifdef INET6 |
| 2440 |
uint32_t |
| 2441 |
tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) |
| 2442 |
{ |
| 2443 |
struct nhop6_extended nh6; |
| 2444 |
struct in6_addr dst6; |
| 2445 |
uint32_t scopeid; |
| 2446 |
struct ifnet *ifp; |
| 2447 |
uint32_t maxmtu = 0; |
| 2448 |
|
| 2449 |
KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); |
| 2450 |
|
| 2451 |
if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { |
| 2452 |
in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); |
| 2453 |
if (fib6_lookup_nh_ext(inc->inc_fibnum, &dst6, scopeid, 0, |
| 2454 |
0, &nh6) != 0) |
| 2455 |
return (0); |
| 2456 |
|
| 2457 |
ifp = nh6.nh_ifp; |
| 2458 |
maxmtu = nh6.nh_mtu; |
| 2459 |
|
| 2460 |
/* Report additional interface capabilities. */ |
| 2461 |
if (cap != NULL) { |
| 2462 |
if (ifp->if_capenable & IFCAP_TSO6 && |
| 2463 |
ifp->if_hwassist & CSUM_TSO) { |
| 2464 |
cap->ifcap |= CSUM_TSO; |
| 2465 |
cap->tsomax = ifp->if_hw_tsomax; |
| 2466 |
cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; |
| 2467 |
cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; |
| 2468 |
} |
| 2469 |
} |
| 2470 |
fib6_free_nh_ext(inc->inc_fibnum, &nh6); |
| 2471 |
} |
| 2472 |
|
| 2473 |
return (maxmtu); |
| 2474 |
} |
| 2475 |
#endif /* INET6 */ |
| 2476 |
|
| 2477 |
/* |
| 2478 |
* Calculate effective SMSS per RFC5681 definition for a given TCP |
| 2479 |
* connection at its current state, taking into account SACK and etc. |
| 2480 |
*/ |
| 2481 |
u_int |
| 2482 |
tcp_maxseg(const struct tcpcb *tp) |
| 2483 |
{ |
| 2484 |
u_int optlen; |
| 2485 |
|
| 2486 |
if (tp->t_flags & TF_NOOPT) |
| 2487 |
return (tp->t_maxseg); |
| 2488 |
|
| 2489 |
/* |
| 2490 |
* Here we have a simplified code from tcp_addoptions(), |
| 2491 |
* without a proper loop, and having most of paddings hardcoded. |
| 2492 |
* We might make mistakes with padding here in some edge cases, |
| 2493 |
* but this is harmless, since result of tcp_maxseg() is used |
| 2494 |
* only in cwnd and ssthresh estimations. |
| 2495 |
*/ |
| 2496 |
#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) |
| 2497 |
if (TCPS_HAVEESTABLISHED(tp->t_state)) { |
| 2498 |
if (tp->t_flags & TF_RCVD_TSTMP) |
| 2499 |
optlen = TCPOLEN_TSTAMP_APPA; |
| 2500 |
else |
| 2501 |
optlen = 0; |
| 2502 |
#ifdef TCP_SIGNATURE |
| 2503 |
if (tp->t_flags & TF_SIGNATURE) |
| 2504 |
optlen += PAD(TCPOLEN_SIGNATURE); |
| 2505 |
#endif |
| 2506 |
if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { |
| 2507 |
optlen += TCPOLEN_SACKHDR; |
| 2508 |
optlen += tp->rcv_numsacks * TCPOLEN_SACK; |
| 2509 |
optlen = PAD(optlen); |
| 2510 |
} |
| 2511 |
} else { |
| 2512 |
if (tp->t_flags & TF_REQ_TSTMP) |
| 2513 |
optlen = TCPOLEN_TSTAMP_APPA; |
| 2514 |
else |
| 2515 |
optlen = PAD(TCPOLEN_MAXSEG); |
| 2516 |
if (tp->t_flags & TF_REQ_SCALE) |
| 2517 |
optlen += PAD(TCPOLEN_WINDOW); |
| 2518 |
#ifdef TCP_SIGNATURE |
| 2519 |
if (tp->t_flags & TF_SIGNATURE) |
| 2520 |
optlen += PAD(TCPOLEN_SIGNATURE); |
| 2521 |
#endif |
| 2522 |
if (tp->t_flags & TF_SACK_PERMIT) |
| 2523 |
optlen += PAD(TCPOLEN_SACK_PERMITTED); |
| 2524 |
} |
| 2525 |
#undef PAD |
| 2526 |
optlen = min(optlen, TCP_MAXOLEN); |
| 2527 |
return (tp->t_maxseg - optlen); |
| 2528 |
} |
| 2529 |
|
| 2530 |
#ifdef IPSEC |
| 2531 |
/* compute ESP/AH header size for TCP, including outer IP header. */ |
| 2532 |
size_t |
| 2533 |
ipsec_hdrsiz_tcp(struct tcpcb *tp) |
| 2534 |
{ |
| 2535 |
struct inpcb *inp; |
| 2536 |
struct mbuf *m; |
| 2537 |
size_t hdrsiz; |
| 2538 |
struct ip *ip; |
| 2539 |
#ifdef INET6 |
| 2540 |
struct ip6_hdr *ip6; |
| 2541 |
#endif |
| 2542 |
struct tcphdr *th; |
| 2543 |
|
| 2544 |
if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL) || |
| 2545 |
(!key_havesp(IPSEC_DIR_OUTBOUND))) |
| 2546 |
return (0); |
| 2547 |
m = m_gethdr(M_NOWAIT, MT_DATA); |
| 2548 |
if (!m) |
| 2549 |
return (0); |
| 2550 |
|
| 2551 |
#ifdef INET6 |
| 2552 |
if ((inp->inp_vflag & INP_IPV6) != 0) { |
| 2553 |
ip6 = mtod(m, struct ip6_hdr *); |
| 2554 |
th = (struct tcphdr *)(ip6 + 1); |
| 2555 |
m->m_pkthdr.len = m->m_len = |
| 2556 |
sizeof(struct ip6_hdr) + sizeof(struct tcphdr); |
| 2557 |
tcpip_fillheaders(inp, ip6, th); |
| 2558 |
hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); |
| 2559 |
} else |
| 2560 |
#endif /* INET6 */ |
| 2561 |
{ |
| 2562 |
ip = mtod(m, struct ip *); |
| 2563 |
th = (struct tcphdr *)(ip + 1); |
| 2564 |
m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); |
| 2565 |
tcpip_fillheaders(inp, ip, th); |
| 2566 |
hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); |
| 2567 |
} |
| 2568 |
|
| 2569 |
m_free(m); |
| 2570 |
return (hdrsiz); |
| 2571 |
} |
| 2572 |
#endif /* IPSEC */ |
| 2573 |
|
| 2574 |
#ifdef TCP_SIGNATURE |
| 2575 |
/* |
| 2576 |
* Callback function invoked by m_apply() to digest TCP segment data |
| 2577 |
* contained within an mbuf chain. |
| 2578 |
*/ |
| 2579 |
static int |
| 2580 |
tcp_signature_apply(void *fstate, void *data, u_int len) |
| 2581 |
{ |
| 2582 |
|
| 2583 |
MD5Update(fstate, (u_char *)data, len); |
| 2584 |
return (0); |
| 2585 |
} |
| 2586 |
|
| 2587 |
/* |
| 2588 |
* XXX The key is retrieved from the system's PF_KEY SADB, by keying a |
| 2589 |
* search with the destination IP address, and a 'magic SPI' to be |
| 2590 |
* determined by the application. This is hardcoded elsewhere to 1179 |
| 2591 |
*/ |
| 2592 |
struct secasvar * |
| 2593 |
tcp_get_sav(struct mbuf *m, u_int direction) |
| 2594 |
{ |
| 2595 |
union sockaddr_union dst; |
| 2596 |
struct secasvar *sav; |
| 2597 |
struct ip *ip; |
| 2598 |
#ifdef INET6 |
| 2599 |
struct ip6_hdr *ip6; |
| 2600 |
char ip6buf[INET6_ADDRSTRLEN]; |
| 2601 |
#endif |
| 2602 |
|
| 2603 |
/* Extract the destination from the IP header in the mbuf. */ |
| 2604 |
bzero(&dst, sizeof(union sockaddr_union)); |
| 2605 |
ip = mtod(m, struct ip *); |
| 2606 |
#ifdef INET6 |
| 2607 |
ip6 = NULL; /* Make the compiler happy. */ |
| 2608 |
#endif |
| 2609 |
switch (ip->ip_v) { |
| 2610 |
#ifdef INET |
| 2611 |
case IPVERSION: |
| 2612 |
dst.sa.sa_len = sizeof(struct sockaddr_in); |
| 2613 |
dst.sa.sa_family = AF_INET; |
| 2614 |
dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? |
| 2615 |
ip->ip_src : ip->ip_dst; |
| 2616 |
break; |
| 2617 |
#endif |
| 2618 |
#ifdef INET6 |
| 2619 |
case (IPV6_VERSION >> 4): |
| 2620 |
ip6 = mtod(m, struct ip6_hdr *); |
| 2621 |
dst.sa.sa_len = sizeof(struct sockaddr_in6); |
| 2622 |
dst.sa.sa_family = AF_INET6; |
| 2623 |
dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ? |
| 2624 |
ip6->ip6_src : ip6->ip6_dst; |
| 2625 |
break; |
| 2626 |
#endif |
| 2627 |
default: |
| 2628 |
return (NULL); |
| 2629 |
/* NOTREACHED */ |
| 2630 |
break; |
| 2631 |
} |
| 2632 |
|
| 2633 |
/* Look up an SADB entry which matches the address of the peer. */ |
| 2634 |
sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); |
| 2635 |
if (sav == NULL) { |
| 2636 |
ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__, |
| 2637 |
(ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) : |
| 2638 |
#ifdef INET6 |
| 2639 |
(ip->ip_v == (IPV6_VERSION >> 4)) ? |
| 2640 |
ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : |
| 2641 |
#endif |
| 2642 |
"(unsupported)")); |
| 2643 |
} |
| 2644 |
|
| 2645 |
return (sav); |
| 2646 |
} |
| 2647 |
|
| 2648 |
/* |
| 2649 |
* Compute TCP-MD5 hash of a TCP segment. (RFC2385) |
| 2650 |
* |
| 2651 |
* Parameters: |
| 2652 |
* m pointer to head of mbuf chain |
| 2653 |
* len length of TCP segment data, excluding options |
| 2654 |
* optlen length of TCP segment options |
| 2655 |
* buf pointer to storage for computed MD5 digest |
| 2656 |
 *	sav	pointer to security association
| 2657 |
* |
| 2658 |
* We do this over ip, tcphdr, segment data, and the key in the SADB. |
| 2659 |
* When called from tcp_input(), we can be sure that th_sum has been |
| 2660 |
* zeroed out and verified already. |
| 2661 |
* |
| 2662 |
* Releases reference to SADB key before return. |
| 2663 |
* |
| 2664 |
* Return 0 if successful, otherwise return -1. |
| 2665 |
* |
| 2666 |
*/ |
| 2667 |
int |
| 2668 |
tcp_signature_do_compute(struct mbuf *m, int len, int optlen, |
| 2669 |
u_char *buf, struct secasvar *sav) |
| 2670 |
{ |
| 2671 |
#ifdef INET |
| 2672 |
struct ippseudo ippseudo; |
| 2673 |
#endif |
| 2674 |
MD5_CTX ctx; |
| 2675 |
int doff; |
| 2676 |
struct ip *ip; |
| 2677 |
#ifdef INET |
| 2678 |
struct ipovly *ipovly; |
| 2679 |
#endif |
| 2680 |
struct tcphdr *th; |
| 2681 |
#ifdef INET6 |
| 2682 |
struct ip6_hdr *ip6; |
| 2683 |
struct in6_addr in6; |
| 2684 |
uint32_t plen; |
| 2685 |
uint16_t nhdr; |
| 2686 |
#endif |
| 2687 |
u_short savecsum; |
| 2688 |
|
| 2689 |
KASSERT(m != NULL, ("NULL mbuf chain")); |
| 2690 |
KASSERT(buf != NULL, ("NULL signature pointer")); |
| 2691 |
|
| 2692 |
/* Extract the destination from the IP header in the mbuf. */ |
| 2693 |
ip = mtod(m, struct ip *); |
| 2694 |
#ifdef INET6 |
| 2695 |
ip6 = NULL; /* Make the compiler happy. */ |
| 2696 |
#endif |
| 2697 |
|
| 2698 |
MD5Init(&ctx); |
| 2699 |
/* |
| 2700 |
* Step 1: Update MD5 hash with IP(v6) pseudo-header. |
| 2701 |
* |
| 2702 |
* XXX The ippseudo header MUST be digested in network byte order, |
| 2703 |
* or else we'll fail the regression test. Assume all fields we've |
| 2704 |
* been doing arithmetic on have been in host byte order. |
| 2705 |
* XXX One cannot depend on ipovly->ih_len here. When called from |
| 2706 |
* tcp_output(), the underlying ip_len member has not yet been set. |
| 2707 |
*/ |
| 2708 |
switch (ip->ip_v) { |
| 2709 |
#ifdef INET |
| 2710 |
case IPVERSION: |
| 2711 |
ipovly = (struct ipovly *)ip; |
| 2712 |
ippseudo.ippseudo_src = ipovly->ih_src; |
| 2713 |
ippseudo.ippseudo_dst = ipovly->ih_dst; |
| 2714 |
ippseudo.ippseudo_pad = 0; |
| 2715 |
ippseudo.ippseudo_p = IPPROTO_TCP; |
| 2716 |
ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + |
| 2717 |
optlen); |
| 2718 |
MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); |
| 2719 |
|
| 2720 |
th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip)); |
| 2721 |
doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen; |
| 2722 |
break; |
| 2723 |
#endif |
| 2724 |
#ifdef INET6 |
| 2725 |
/* |
| 2726 |
* RFC 2385, 2.0 Proposal |
| 2727 |
* For IPv6, the pseudo-header is as described in RFC 2460, namely the |
| 2728 |
* 128-bit source IPv6 address, 128-bit destination IPv6 address, zero- |
| 2729 |
* extended next header value (to form 32 bits), and 32-bit segment |
| 2730 |
* length. |
| 2731 |
* Note: Upper-Layer Packet Length comes before Next Header. |
| 2732 |
*/ |
| 2733 |
case (IPV6_VERSION >> 4): |
| 2734 |
ip6 = mtod(m, struct ip6_hdr *); |
| 2735 |
in6 = ip6->ip6_src; |
| 2736 |
in6_clearscope(&in6); |
| 2737 |
MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); |
| 2738 |
in6 = ip6->ip6_dst; |
| 2739 |
in6_clearscope(&in6); |
| 2740 |
MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); |
| 2741 |
plen = htonl(len + sizeof(struct tcphdr) + optlen); |
| 2742 |
MD5Update(&ctx, (char *)&plen, sizeof(uint32_t)); |
| 2743 |
nhdr = 0; |
| 2744 |
MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); |
| 2745 |
MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); |
| 2746 |
MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); |
| 2747 |
nhdr = IPPROTO_TCP; |
| 2748 |
MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); |
| 2749 |
|
| 2750 |
th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr)); |
| 2751 |
doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen; |
| 2752 |
break; |
| 2753 |
#endif |
| 2754 |
default: |
| 2755 |
KEY_FREESAV(&sav); |
| 2756 |
return (-1); |
| 2757 |
/* NOTREACHED */ |
| 2758 |
break; |
| 2759 |
} |
| 2760 |
|
| 2761 |
|
| 2762 |
/* |
| 2763 |
* Step 2: Update MD5 hash with TCP header, excluding options. |
| 2764 |
* The TCP checksum must be set to zero. |
| 2765 |
*/ |
| 2766 |
savecsum = th->th_sum; |
| 2767 |
th->th_sum = 0; |
| 2768 |
MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); |
| 2769 |
th->th_sum = savecsum; |
| 2770 |
|
| 2771 |
/* |
| 2772 |
* Step 3: Update MD5 hash with TCP segment data. |
| 2773 |
* Use m_apply() to avoid an early m_pullup(). |
| 2774 |
*/ |
| 2775 |
if (len > 0) |
| 2776 |
m_apply(m, doff, len, tcp_signature_apply, &ctx); |
| 2777 |
|
| 2778 |
/* |
| 2779 |
* Step 4: Update MD5 hash with shared secret. |
| 2780 |
*/ |
| 2781 |
MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth)); |
| 2782 |
MD5Final(buf, &ctx); |
| 2783 |
|
| 2784 |
key_sa_recordxfer(sav, m); |
| 2785 |
KEY_FREESAV(&sav); |
| 2786 |
return (0); |
| 2787 |
} |
| 2788 |
|
| 2789 |
/* |
| 2790 |
* Compute TCP-MD5 hash of a TCP segment. (RFC2385) |
| 2791 |
* |
| 2792 |
* Return 0 if successful, otherwise return -1. |
| 2793 |
*/ |
| 2794 |
int |
| 2795 |
tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, |
| 2796 |
u_char *buf, u_int direction) |
| 2797 |
{ |
| 2798 |
struct secasvar *sav; |
| 2799 |
|
| 2800 |
if ((sav = tcp_get_sav(m, direction)) == NULL) |
| 2801 |
return (-1); |
| 2802 |
|
| 2803 |
return (tcp_signature_do_compute(m, len, optlen, buf, sav)); |
| 2804 |
} |
| 2805 |
|
| 2806 |
/* |
| 2807 |
* Verify the TCP-MD5 hash of a TCP segment. (RFC2385) |
| 2808 |
* |
| 2809 |
* Parameters: |
| 2810 |
* m pointer to head of mbuf chain |
| 2811 |
* len length of TCP segment data, excluding options |
| 2812 |
* optlen length of TCP segment options |
| 2813 |
* buf pointer to storage for computed MD5 digest |
| 2814 |
* direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) |
| 2815 |
* |
| 2816 |
* Return 1 if successful, otherwise return 0. |
| 2817 |
*/ |
| 2818 |
int |
| 2819 |
tcp_signature_verify(struct mbuf *m, int off0, int tlen, int optlen, |
| 2820 |
struct tcpopt *to, struct tcphdr *th, u_int tcpbflag) |
| 2821 |
{ |
| 2822 |
char tmpdigest[TCP_SIGLEN]; |
| 2823 |
|
| 2824 |
if (tcp_sig_checksigs == 0) |
| 2825 |
return (1); |
| 2826 |
if ((tcpbflag & TF_SIGNATURE) == 0) { |
| 2827 |
if ((to->to_flags & TOF_SIGNATURE) != 0) { |
| 2828 |
|
| 2829 |
/* |
| 2830 |
* If this socket is not expecting signature but |
| 2831 |
* the segment contains signature just fail. |
| 2832 |
*/ |
| 2833 |
TCPSTAT_INC(tcps_sig_err_sigopt); |
| 2834 |
TCPSTAT_INC(tcps_sig_rcvbadsig); |
| 2835 |
return (0); |
| 2836 |
} |
| 2837 |
|
| 2838 |
/* Signature is not expected, and not present in segment. */ |
| 2839 |
return (1); |
| 2840 |
} |
| 2841 |
|
| 2842 |
/* |
| 2843 |
* If this socket is expecting signature but the segment does not |
| 2844 |
* contain any just fail. |
| 2845 |
*/ |
| 2846 |
if ((to->to_flags & TOF_SIGNATURE) == 0) { |
| 2847 |
TCPSTAT_INC(tcps_sig_err_nosigopt); |
| 2848 |
TCPSTAT_INC(tcps_sig_rcvbadsig); |
| 2849 |
return (0); |
| 2850 |
} |
| 2851 |
if (tcp_signature_compute(m, off0, tlen, optlen, &tmpdigest[0], |
| 2852 |
IPSEC_DIR_INBOUND) == -1) { |
| 2853 |
TCPSTAT_INC(tcps_sig_err_buildsig); |
| 2854 |
TCPSTAT_INC(tcps_sig_rcvbadsig); |
| 2855 |
return (0); |
| 2856 |
} |
| 2857 |
|
| 2858 |
if (bcmp(to->to_signature, &tmpdigest[0], TCP_SIGLEN) != 0) { |
| 2859 |
TCPSTAT_INC(tcps_sig_rcvbadsig); |
| 2860 |
return (0); |
| 2861 |
} |
| 2862 |
TCPSTAT_INC(tcps_sig_rcvgoodsig); |
| 2863 |
return (1); |
| 2864 |
} |
| 2865 |
#endif /* TCP_SIGNATURE */ |
| 2866 |
|
| 2867 |
static int |
| 2868 |
sysctl_drop(SYSCTL_HANDLER_ARGS) |
| 2869 |
{ |
| 2870 |
/* addrs[0] is a foreign socket, addrs[1] is a local one. */ |
| 2871 |
struct sockaddr_storage addrs[2]; |
| 2872 |
struct inpcb *inp; |
| 2873 |
struct tcpcb *tp; |
| 2874 |
struct tcptw *tw; |
| 2875 |
struct sockaddr_in *fin, *lin; |
| 2876 |
#ifdef INET6 |
| 2877 |
struct sockaddr_in6 *fin6, *lin6; |
| 2878 |
#endif |
| 2879 |
int error; |
| 2880 |
|
| 2881 |
inp = NULL; |
| 2882 |
fin = lin = NULL; |
| 2883 |
#ifdef INET6 |
| 2884 |
fin6 = lin6 = NULL; |
| 2885 |
#endif |
| 2886 |
error = 0; |
| 2887 |
|
| 2888 |
if (req->oldptr != NULL || req->oldlen != 0) |
| 2889 |
return (EINVAL); |
| 2890 |
if (req->newptr == NULL) |
| 2891 |
return (EPERM); |
| 2892 |
if (req->newlen < sizeof(addrs)) |
| 2893 |
return (ENOMEM); |
| 2894 |
error = SYSCTL_IN(req, &addrs, sizeof(addrs)); |
| 2895 |
if (error) |
| 2896 |
return (error); |
| 2897 |
|
| 2898 |
switch (addrs[0].ss_family) { |
| 2899 |
#ifdef INET6 |
| 2900 |
case AF_INET6: |
| 2901 |
fin6 = (struct sockaddr_in6 *)&addrs[0]; |
| 2902 |
lin6 = (struct sockaddr_in6 *)&addrs[1]; |
| 2903 |
if (fin6->sin6_len != sizeof(struct sockaddr_in6) || |
| 2904 |
lin6->sin6_len != sizeof(struct sockaddr_in6)) |
| 2905 |
return (EINVAL); |
| 2906 |
if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { |
| 2907 |
if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) |
| 2908 |
return (EINVAL); |
| 2909 |
in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); |
| 2910 |
in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); |
| 2911 |
fin = (struct sockaddr_in *)&addrs[0]; |
| 2912 |
lin = (struct sockaddr_in *)&addrs[1]; |
| 2913 |
break; |
| 2914 |
} |
| 2915 |
error = sa6_embedscope(fin6, V_ip6_use_defzone); |
| 2916 |
if (error) |
| 2917 |
return (error); |
| 2918 |
error = sa6_embedscope(lin6, V_ip6_use_defzone); |
| 2919 |
if (error) |
| 2920 |
return (error); |
| 2921 |
break; |
| 2922 |
#endif |
| 2923 |
#ifdef INET |
| 2924 |
case AF_INET: |
| 2925 |
fin = (struct sockaddr_in *)&addrs[0]; |
| 2926 |
lin = (struct sockaddr_in *)&addrs[1]; |
| 2927 |
if (fin->sin_len != sizeof(struct sockaddr_in) || |
| 2928 |
lin->sin_len != sizeof(struct sockaddr_in)) |
| 2929 |
return (EINVAL); |
| 2930 |
break; |
| 2931 |
#endif |
| 2932 |
default: |
| 2933 |
return (EINVAL); |
| 2934 |
} |
| 2935 |
INP_INFO_RLOCK(&V_tcbinfo); |
| 2936 |
switch (addrs[0].ss_family) { |
| 2937 |
#ifdef INET6 |
| 2938 |
case AF_INET6: |
| 2939 |
inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, |
| 2940 |
fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, |
| 2941 |
INPLOOKUP_WLOCKPCB, NULL); |
| 2942 |
break; |
| 2943 |
#endif |
| 2944 |
#ifdef INET |
| 2945 |
case AF_INET: |
| 2946 |
inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, |
| 2947 |
lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); |
| 2948 |
break; |
| 2949 |
#endif |
| 2950 |
} |
| 2951 |
if (inp != NULL) { |
| 2952 |
if (inp->inp_flags & INP_TIMEWAIT) { |
| 2953 |
/* |
| 2954 |
* XXXRW: There currently exists a state where an |
| 2955 |
* inpcb is present, but its timewait state has been |
| 2956 |
* discarded. For now, don't allow dropping of this |
| 2957 |
* type of inpcb. |
| 2958 |
*/ |
| 2959 |
tw = intotw(inp); |
| 2960 |
if (tw != NULL) |
| 2961 |
tcp_twclose(tw, 0); |
| 2962 |
else |
| 2963 |
INP_WUNLOCK(inp); |
| 2964 |
} else if (!(inp->inp_flags & INP_DROPPED) && |
| 2965 |
!(inp->inp_socket->so_options & SO_ACCEPTCONN)) { |
| 2966 |
tp = intotcpcb(inp); |
| 2967 |
tp = tcp_drop(tp, ECONNABORTED); |
| 2968 |
if (tp != NULL) |
| 2969 |
INP_WUNLOCK(inp); |
| 2970 |
} else |
| 2971 |
INP_WUNLOCK(inp); |
| 2972 |
} else |
| 2973 |
error = ESRCH; |
| 2974 |
INP_INFO_RUNLOCK(&V_tcbinfo); |
| 2975 |
return (error); |
| 2976 |
} |
| 2977 |
|
| 2978 |
/*
 * Attach sysctl_drop() as the "drop" node (TCPCTL_DROP) under
 * _net_inet_tcp.  The node is a writable CTLTYPE_STRUCT with the
 * CTLFLAG_VNET and CTLFLAG_SKIP flags set.
 */
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
    CTLTYPE_STRUCT | CTLFLAG_VNET | CTLFLAG_WR | CTLFLAG_SKIP,
    NULL, 0, sysctl_drop, "", "Drop TCP connection");
| 2981 |
|
| 2982 |
/* |
| 2983 |
* Generate a standardized TCP log line for use throughout the |
| 2984 |
* tcp subsystem. Memory allocation is done with M_NOWAIT to |
| 2985 |
* allow use in the interrupt context. |
| 2986 |
* |
| 2987 |
* NB: The caller MUST free(s, M_TCPLOG) the returned string. |
| 2988 |
* NB: The function may return NULL if memory allocation failed. |
| 2989 |
* |
| 2990 |
* Due to header inclusion and ordering limitations the struct ip |
| 2991 |
* and ip6_hdr pointers have to be passed as void pointers. |
| 2992 |
*/ |
| 2993 |
char * |
| 2994 |
tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, |
| 2995 |
const void *ip6hdr) |
| 2996 |
{ |
| 2997 |
|
| 2998 |
/* Is logging enabled? */ |
| 2999 |
if (tcp_log_in_vain == 0) |
| 3000 |
return (NULL); |
| 3001 |
|
| 3002 |
return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); |
| 3003 |
} |
| 3004 |
|
| 3005 |
char * |
| 3006 |
tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, |
| 3007 |
const void *ip6hdr) |
| 3008 |
{ |
| 3009 |
|
| 3010 |
/* Is logging enabled? */ |
| 3011 |
if (tcp_log_debug == 0) |
| 3012 |
return (NULL); |
| 3013 |
|
| 3014 |
return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); |
| 3015 |
} |
| 3016 |
|
| 3017 |
static char * |
| 3018 |
tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, |
| 3019 |
const void *ip6hdr) |
| 3020 |
{ |
| 3021 |
char *s, *sp; |
| 3022 |
size_t size; |
| 3023 |
struct ip *ip; |
| 3024 |
#ifdef INET6 |
| 3025 |
const struct ip6_hdr *ip6; |
| 3026 |
|
| 3027 |
ip6 = (const struct ip6_hdr *)ip6hdr; |
| 3028 |
#endif /* INET6 */ |
| 3029 |
ip = (struct ip *)ip4hdr; |
| 3030 |
|
| 3031 |
/* |
| 3032 |
* The log line looks like this: |
| 3033 |
* "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>" |
| 3034 |
*/ |
| 3035 |
size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + |
| 3036 |
sizeof(PRINT_TH_FLAGS) + 1 + |
| 3037 |
#ifdef INET6 |
| 3038 |
2 * INET6_ADDRSTRLEN; |
| 3039 |
#else |
| 3040 |
2 * INET_ADDRSTRLEN; |
| 3041 |
#endif /* INET6 */ |
| 3042 |
|
| 3043 |
s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); |
| 3044 |
if (s == NULL) |
| 3045 |
return (NULL); |
| 3046 |
|
| 3047 |
strcat(s, "TCP: ["); |
| 3048 |
sp = s + strlen(s); |
| 3049 |
|
| 3050 |
if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { |
| 3051 |
inet_ntoa_r(inc->inc_faddr, sp); |
| 3052 |
sp = s + strlen(s); |
| 3053 |
sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); |
| 3054 |
sp = s + strlen(s); |
| 3055 |
inet_ntoa_r(inc->inc_laddr, sp); |
| 3056 |
sp = s + strlen(s); |
| 3057 |
sprintf(sp, "]:%i", ntohs(inc->inc_lport)); |
| 3058 |
#ifdef INET6 |
| 3059 |
} else if (inc) { |
| 3060 |
ip6_sprintf(sp, &inc->inc6_faddr); |
| 3061 |
sp = s + strlen(s); |
| 3062 |
sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); |
| 3063 |
sp = s + strlen(s); |
| 3064 |
ip6_sprintf(sp, &inc->inc6_laddr); |
| 3065 |
sp = s + strlen(s); |
| 3066 |
sprintf(sp, "]:%i", ntohs(inc->inc_lport)); |
| 3067 |
} else if (ip6 && th) { |
| 3068 |
ip6_sprintf(sp, &ip6->ip6_src); |
| 3069 |
sp = s + strlen(s); |
| 3070 |
sprintf(sp, "]:%i to [", ntohs(th->th_sport)); |
| 3071 |
sp = s + strlen(s); |
| 3072 |
ip6_sprintf(sp, &ip6->ip6_dst); |
| 3073 |
sp = s + strlen(s); |
| 3074 |
sprintf(sp, "]:%i", ntohs(th->th_dport)); |
| 3075 |
#endif /* INET6 */ |
| 3076 |
#ifdef INET |
| 3077 |
} else if (ip && th) { |
| 3078 |
inet_ntoa_r(ip->ip_src, sp); |
| 3079 |
sp = s + strlen(s); |
| 3080 |
sprintf(sp, "]:%i to [", ntohs(th->th_sport)); |
| 3081 |
sp = s + strlen(s); |
| 3082 |
inet_ntoa_r(ip->ip_dst, sp); |
| 3083 |
sp = s + strlen(s); |
| 3084 |
sprintf(sp, "]:%i", ntohs(th->th_dport)); |
| 3085 |
#endif /* INET */ |
| 3086 |
} else { |
| 3087 |
free(s, M_TCPLOG); |
| 3088 |
return (NULL); |
| 3089 |
} |
| 3090 |
sp = s + strlen(s); |
| 3091 |
if (th) |
| 3092 |
sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); |
| 3093 |
if (*(s + size - 1) != '\0') |
| 3094 |
panic("%s: string too long", __func__); |
| 3095 |
return (s); |
| 3096 |
} |
| 3097 |
|
| 3098 |
/* |
| 3099 |
* A subroutine which makes it easy to track TCP state changes with DTrace. |
| 3100 |
* This function shouldn't be called for t_state initializations that don't |
| 3101 |
* correspond to actual TCP state transitions. |
| 3102 |
*/ |
| 3103 |
void |
| 3104 |
tcp_state_change(struct tcpcb *tp, int newstate) |
| 3105 |
{ |
| 3106 |
#if defined(KDTRACE_HOOKS) |
| 3107 |
int pstate = tp->t_state; |
| 3108 |
#endif |
| 3109 |
|
| 3110 |
TCPSTATES_DEC(tp->t_state); |
| 3111 |
TCPSTATES_INC(newstate); |
| 3112 |
tp->t_state = newstate; |
| 3113 |
TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); |
| 3114 |
} |