/*	$NetBSD: pf_norm.c,v 1.20 2009/01/11 10:29:01 cegger Exp $	*/
/*	$OpenBSD: pf_norm.c,v 1.109 2007/05/28 17:16:39 henning Exp $ */

/*
 * Copyright 2001 Niels Provos <provos@citi.umich.edu>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pf_norm.c,v 1.20 2009/01/11 10:29:01 cegger Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/time.h>
#include <sys/pool.h>

#ifndef __NetBSD__
#include <dev/rndvar.h>
#endif /* !__NetBSD__ */

#include <net/if.h>
#include <net/if_types.h>
#include <net/bpf.h>
#include <net/route.h>
#include <net/if_pflog.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/udp.h>
#include <netinet/ip_icmp.h>

#ifdef INET6
#include <netinet/ip6.h>
#endif /* INET6 */

#include <net/pfvar.h>
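
/*
 * Fragment bookkeeping.  Two strategies coexist below: full reassembly
 * keeps every fragment's mbuf in a queue of pf_frent entries until the
 * datagram is complete, while the non-buffering cache (the "fragment
 * crop"/"fragment drop" scrub modes) only remembers which byte ranges
 * have already passed, in pf_frcache entries.
 */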
struct pf_frent {
	LIST_ENTRY(pf_frent) fr_next;
	struct ip *fr_ip;
	struct mbuf *fr_m;
};

struct pf_frcache {
	LIST_ENTRY(pf_frcache) fr_next;
	uint16_t	fr_off;
	uint16_t	fr_end;
};

#define PFFRAG_SEENLAST	0x0001		/* Seen the last fragment for this */
#define PFFRAG_NOBUFFER	0x0002		/* Non-buffering fragment cache */
#define PFFRAG_DROP	0x0004		/* Drop all fragments */
#define BUFFER_FRAGMENTS(fr)	(!((fr)->fr_flags & PFFRAG_NOBUFFER))
struct pf_fragment {
	RB_ENTRY(pf_fragment) fr_entry;
	TAILQ_ENTRY(pf_fragment) frag_next;
	struct in_addr	fr_src;
	struct in_addr	fr_dst;
	u_int8_t	fr_p;		/* protocol of this fragment */
	u_int8_t	fr_flags;	/* status flags */
	u_int16_t	fr_id;		/* fragment id for reassemble */
	u_int16_t	fr_max;		/* fragment data max */
	u_int32_t	fr_timeout;
#define fr_queue	fr_u.fru_queue
#define fr_cache	fr_u.fru_cache
	union {
		LIST_HEAD(pf_fragq, pf_frent) fru_queue;	/* buffering */
		LIST_HEAD(pf_cacheq, pf_frcache) fru_cache;	/* non-buf */
	} fr_u;
};

TAILQ_HEAD(pf_fragqueue, pf_fragment)	pf_fragqueue;
TAILQ_HEAD(pf_cachequeue, pf_fragment)	pf_cachequeue;
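
/*
 * Both strategies share struct pf_fragment above: BUFFER_FRAGMENTS()
 * selects the fru_queue arm of the union, the non-buffering cache the
 * fru_cache arm.  Buffered fragments live in pf_frag_tree/pf_fragqueue,
 * cached ranges in pf_cache_tree/pf_cachequeue; the TAILQs are kept in
 * LRU order so expiry can walk from the tail.
 */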
static __inline int	 pf_frag_compare(struct pf_fragment *,
			    struct pf_fragment *);
RB_HEAD(pf_frag_tree, pf_fragment)	pf_frag_tree, pf_cache_tree;
RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
/* Private prototypes */
void			 pf_ip2key(struct pf_fragment *, struct ip *);
void			 pf_remove_fragment(struct pf_fragment *);
void			 pf_flush_fragments(void);
void			 pf_free_fragment(struct pf_fragment *);
struct pf_fragment	*pf_find_fragment(struct ip *, struct pf_frag_tree *);
struct mbuf		*pf_reassemble(struct mbuf **, struct pf_fragment **,
			    struct pf_frent *, int);
struct mbuf		*pf_fragcache(struct mbuf **, struct ip *,
			    struct pf_fragment **, int, int, int *);
int			 pf_normalize_tcpopt(struct pf_rule *, struct mbuf *,
			    struct tcphdr *, int);

#define	DPFPRINTF(x) do {				\
	if (pf_status.debug >= PF_DEBUG_MISC) {		\
		printf("%s: ", __func__);		\
		printf x ;				\
	}						\
} while(0)
/* Globals */
struct pool		 pf_frent_pl, pf_frag_pl, pf_cache_pl, pf_cent_pl;
struct pool		 pf_state_scrub_pl;
int			 pf_nfrents, pf_ncache;
void
pf_normalize_init(void)
{
#ifdef __NetBSD__
	pool_init(&pf_frent_pl, sizeof(struct pf_frent), 0, 0, 0, "pffrent",
	    NULL, IPL_SOFTNET);
	pool_init(&pf_frag_pl, sizeof(struct pf_fragment), 0, 0, 0, "pffrag",
	    NULL, IPL_SOFTNET);
	pool_init(&pf_cache_pl, sizeof(struct pf_fragment), 0, 0, 0,
	    "pffrcache", NULL, IPL_SOFTNET);
	pool_init(&pf_cent_pl, sizeof(struct pf_frcache), 0, 0, 0, "pffrcent",
	    NULL, IPL_SOFTNET);
	pool_init(&pf_state_scrub_pl, sizeof(struct pf_state_scrub), 0, 0, 0,
	    "pfstscr", NULL, IPL_SOFTNET);
#else
	pool_init(&pf_frent_pl, sizeof(struct pf_frent), 0, 0, 0, "pffrent",
	    NULL);
	pool_init(&pf_frag_pl, sizeof(struct pf_fragment), 0, 0, 0, "pffrag",
	    NULL);
	pool_init(&pf_cache_pl, sizeof(struct pf_fragment), 0, 0, 0,
	    "pffrcache", NULL);
	pool_init(&pf_cent_pl, sizeof(struct pf_frcache), 0, 0, 0, "pffrcent",
	    NULL);
	pool_init(&pf_state_scrub_pl, sizeof(struct pf_state_scrub), 0, 0, 0,
	    "pfstscr", NULL);
#endif /* !__NetBSD__ */

	pool_sethiwat(&pf_frag_pl, PFFRAG_FRAG_HIWAT);
	pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
	pool_sethardlimit(&pf_cache_pl, PFFRAG_FRCACHE_HIWAT, NULL, 0);
	pool_sethardlimit(&pf_cent_pl, PFFRAG_FRCENT_HIWAT, NULL, 0);

	TAILQ_INIT(&pf_fragqueue);
	TAILQ_INIT(&pf_cachequeue);
}
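
/*
 * Total order over (fr_id, fr_p, fr_src, fr_dst) so that fragments of
 * the same original datagram collide in the red-black tree lookup.
 */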
static __inline int
pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
{
	int	diff;

	if ((diff = a->fr_id - b->fr_id))
		return (diff);
	else if ((diff = a->fr_p - b->fr_p))
		return (diff);
	else if (a->fr_src.s_addr < b->fr_src.s_addr)
		return (-1);
	else if (a->fr_src.s_addr > b->fr_src.s_addr)
		return (1);
	else if (a->fr_dst.s_addr < b->fr_dst.s_addr)
		return (-1);
	else if (a->fr_dst.s_addr > b->fr_dst.s_addr)
		return (1);
	return (0);
}
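
/*
 * Anything queued before 'expire' is stale.  With the stock
 * pf_default_rule timeout of 30 seconds for PFTM_FRAG, a fragment
 * stamped at fr_timeout = 1000 is freed once time_second passes 1030.
 * Because the queues are LRU (head is most recent), scanning stops at
 * the first entry that is still fresh.
 */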
void
pf_purge_expired_fragments(void)
{
	struct pf_fragment	*frag;
	u_int32_t		 expire = time_second -
				    pf_default_rule.timeout[PFTM_FRAG];

	while ((frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue)) != NULL) {
		KASSERT(BUFFER_FRAGMENTS(frag));
		if (frag->fr_timeout > expire)
			break;

		DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
		pf_free_fragment(frag);
	}

	while ((frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue)) != NULL) {
		KASSERT(!BUFFER_FRAGMENTS(frag));
		if (frag->fr_timeout > expire)
			break;

		DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
		pf_free_fragment(frag);
		KASSERT(TAILQ_EMPTY(&pf_cachequeue) ||
		    TAILQ_LAST(&pf_cachequeue, pf_cachequeue) != frag);
	}
}
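
/*
 * Called on pool exhaustion: evicts from the queue tail until roughly
 * 10% of the entries are gone, e.g. pf_nfrents == 1000 sets goal = 900
 * and frees until the count drops to it.
 */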
/*
 * Try to flush old fragments to make space for new ones
 */
void
pf_flush_fragments(void)
{
	struct pf_fragment	*frag;
	int			 goal;

	goal = pf_nfrents * 9 / 10;
	DPFPRINTF(("trying to free > %d frents\n",
	    pf_nfrents - goal));
	while (goal < pf_nfrents) {
		frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue);
		if (frag == NULL)
			break;
		pf_free_fragment(frag);
	}

	goal = pf_ncache * 9 / 10;
	DPFPRINTF(("trying to free > %d cache entries\n",
	    pf_ncache - goal));
	while (goal < pf_ncache) {
		frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue);
		if (frag == NULL)
			break;
		pf_free_fragment(frag);
	}
}
/* Frees the fragments and all associated entries */

void
pf_free_fragment(struct pf_fragment *frag)
{
	struct pf_frent		*frent;
	struct pf_frcache	*frcache;

	/* Free all fragments */
	if (BUFFER_FRAGMENTS(frag)) {
		for (frent = LIST_FIRST(&frag->fr_queue); frent;
		    frent = LIST_FIRST(&frag->fr_queue)) {
			LIST_REMOVE(frent, fr_next);

			m_freem(frent->fr_m);
			pool_put(&pf_frent_pl, frent);
			pf_nfrents--;
		}
	} else {
		for (frcache = LIST_FIRST(&frag->fr_cache); frcache;
		    frcache = LIST_FIRST(&frag->fr_cache)) {
			LIST_REMOVE(frcache, fr_next);

			KASSERT(LIST_EMPTY(&frag->fr_cache) ||
			    LIST_FIRST(&frag->fr_cache)->fr_off >
			    frcache->fr_end);

			pool_put(&pf_cent_pl, frcache);
			pf_ncache--;
		}
	}

	pf_remove_fragment(frag);
}
void
pf_ip2key(struct pf_fragment *key, struct ip *ip)
{
	key->fr_p = ip->ip_p;
	key->fr_id = ip->ip_id;
	key->fr_src.s_addr = ip->ip_src.s_addr;
	key->fr_dst.s_addr = ip->ip_dst.s_addr;
}
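
/*
 * Lookups reuse pf_ip2key() to build a throwaway search key from the
 * packet header; a hit refreshes the timeout and moves the entry to the
 * head of its queue, preserving the LRU invariant the purge relies on.
 */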
struct pf_fragment *
pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree)
{
	struct pf_fragment	 key;
	struct pf_fragment	*frag;

	pf_ip2key(&key, ip);

	frag = RB_FIND(pf_frag_tree, tree, &key);
	if (frag != NULL) {
		/* XXX Are we sure we want to update the timeout? */
		frag->fr_timeout = time_second;
		if (BUFFER_FRAGMENTS(frag)) {
			TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
			TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next);
		} else {
			TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
			TAILQ_INSERT_HEAD(&pf_cachequeue, frag, frag_next);
		}
	}

	return (frag);
}
/* Removes a fragment from the fragment queue and frees the fragment */

void
pf_remove_fragment(struct pf_fragment *frag)
{
	if (BUFFER_FRAGMENTS(frag)) {
		RB_REMOVE(pf_frag_tree, &pf_frag_tree, frag);
		TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
		pool_put(&pf_frag_pl, frag);
	} else {
		RB_REMOVE(pf_frag_tree, &pf_cache_tree, frag);
		TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
		pool_put(&pf_cache_pl, frag);
	}
}
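
/*
 * FR_IP_OFF() recovers a fragment's byte offset: the 13-bit ip_off field
 * counts 8-byte units, so e.g. ntohs(ip_off) == 0x2004 (IP_MF plus unit
 * offset 4) masks to 4 and shifts to byte offset 32.  pf_reassemble()
 * trims any overlap with the previous fragment (precut) and with the
 * following ones (aftercut) before linking the new pf_frent in, so the
 * queue always describes disjoint, ascending byte ranges.
 */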
#define FR_IP_OFF(fr)	((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3)
struct mbuf *
pf_reassemble(struct mbuf **m0, struct pf_fragment **frag,
    struct pf_frent *frent, int mff)
{
	struct mbuf	*m = *m0, *m2;
	struct pf_frent	*frea, *next;
	struct pf_frent	*frep = NULL;
	struct ip	*ip = frent->fr_ip;
	int		 hlen = ip->ip_hl << 2;
	u_int16_t	 off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
	u_int16_t	 ip_len = ntohs(ip->ip_len) - ip->ip_hl * 4;
	u_int16_t	 max = ip_len + off;

	KASSERT(*frag == NULL || BUFFER_FRAGMENTS(*frag));

	/* Strip off ip header */
	m->m_data += hlen;
	m->m_len -= hlen;

	/* Create a new reassembly queue for this packet */
	if (*frag == NULL) {
		*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
		if (*frag == NULL) {
			pf_flush_fragments();
			*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
			if (*frag == NULL)
				goto drop_fragment;
		}

		(*frag)->fr_flags = 0;
		(*frag)->fr_max = 0;
		(*frag)->fr_src = frent->fr_ip->ip_src;
		(*frag)->fr_dst = frent->fr_ip->ip_dst;
		(*frag)->fr_p = frent->fr_ip->ip_p;
		(*frag)->fr_id = frent->fr_ip->ip_id;
		(*frag)->fr_timeout = time_second;
		LIST_INIT(&(*frag)->fr_queue);

		RB_INSERT(pf_frag_tree, &pf_frag_tree, *frag);
		TAILQ_INSERT_HEAD(&pf_fragqueue, *frag, frag_next);

		/* We do not have a previous fragment */
		frep = NULL;
		goto insert;
	}

	/*
	 * Find a fragment after the current one:
	 *  - off contains the real shifted offset.
	 */
	LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
		if (FR_IP_OFF(frea) > off)
			break;
		frep = frea;
	}

	KASSERT(frep != NULL || frea != NULL);

	if (frep != NULL &&
	    FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl *
	    4 > off)
	{
		u_int16_t	precut;

		precut = FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) -
		    frep->fr_ip->ip_hl * 4 - off;
		if (precut >= ip_len)
			goto drop_fragment;
		m_adj(frent->fr_m, precut);
		DPFPRINTF(("overlap -%d\n", precut));
		/* Enforce 8 byte boundaries */
		ip->ip_off = htons(ntohs(ip->ip_off) + (precut >> 3));
		off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
		ip_len -= precut;
		ip->ip_len = htons(ip_len);
	}

	for (; frea != NULL && ip_len + off > FR_IP_OFF(frea);
	    frea = next)
	{
		u_int16_t	aftercut;

		aftercut = ip_len + off - FR_IP_OFF(frea);
		DPFPRINTF(("adjust overlap %d\n", aftercut));
		if (aftercut < ntohs(frea->fr_ip->ip_len) - frea->fr_ip->ip_hl
		    * 4)
		{
			frea->fr_ip->ip_len =
			    htons(ntohs(frea->fr_ip->ip_len) - aftercut);
			frea->fr_ip->ip_off = htons(ntohs(frea->fr_ip->ip_off) +
			    (aftercut >> 3));
			m_adj(frea->fr_m, aftercut);
			break;
		}

		/* This fragment is completely overlapped, lose it */
		next = LIST_NEXT(frea, fr_next);
		m_freem(frea->fr_m);
		LIST_REMOVE(frea, fr_next);
		pool_put(&pf_frent_pl, frea);
		pf_nfrents--;
	}

 insert:
	/* Update maximum data size */
	if ((*frag)->fr_max < max)
		(*frag)->fr_max = max;
	/* This is the last segment */
	if (!mff)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	if (frep == NULL)
		LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
	else
		LIST_INSERT_AFTER(frep, frent, fr_next);

	/* Check if we are completely reassembled */
	if (!((*frag)->fr_flags & PFFRAG_SEENLAST))
		return (NULL);

	/* Check if we have all the data */
	off = 0;
	for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
		next = LIST_NEXT(frep, fr_next);

		off += ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4;
		if (off < (*frag)->fr_max &&
		    (next == NULL || FR_IP_OFF(next) != off))
		{
			DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
			    off, next == NULL ? -1 : FR_IP_OFF(next),
			    (*frag)->fr_max));
			return (NULL);
		}
	}
	DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
	if (off < (*frag)->fr_max)
		return (NULL);

	/* We have all the data */
	frent = LIST_FIRST(&(*frag)->fr_queue);
	KASSERT(frent != NULL);
	if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) {
		DPFPRINTF(("drop: too big: %d\n", off));
		pf_free_fragment(*frag);
		*frag = NULL;
		return (NULL);
	}
	next = LIST_NEXT(frent, fr_next);

	/* Magic from ip_input */
	ip = frent->fr_ip;
	m = frent->fr_m;
	m2 = m->m_next;
	m->m_next = NULL;
	m_cat(m, m2);
	pool_put(&pf_frent_pl, frent);
	pf_nfrents--;
	for (frent = next; frent != NULL; frent = next) {
		next = LIST_NEXT(frent, fr_next);

		m2 = frent->fr_m;
		pool_put(&pf_frent_pl, frent);
		pf_nfrents--;
		m_cat(m, m2);
	}

	ip->ip_src = (*frag)->fr_src;
	ip->ip_dst = (*frag)->fr_dst;

	/* Remove from fragment queue */
	pf_remove_fragment(*frag);
	*frag = NULL;

	hlen = ip->ip_hl << 2;
	ip->ip_len = htons(off + hlen);
	m->m_data -= hlen;
	m->m_len += hlen;

	/* some debugging cruft by sklower, below, will go away soon */
	/* XXX this should be done elsewhere */
	if (m->m_flags & M_PKTHDR) {
		int plen = 0;
		for (m2 = m; m2; m2 = m2->m_next)
			plen += m2->m_len;
		m->m_pkthdr.len = plen;
#ifdef __NetBSD__
		m->m_pkthdr.csum_flags = 0;
#endif /* __NetBSD__ */
	}

	DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len)));
	return (m);

 drop_fragment:
	/* Oops - fail safe - drop packet */
	pool_put(&pf_frent_pl, frent);
	pf_nfrents--;
	m_freem(m);
	return (NULL);
}
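
/*
 * Non-buffering counterpart of pf_reassemble(): the packet itself is
 * passed through and only the [off, max) byte range is recorded.  precut
 * measures how much of the head duplicates an already-passed range and
 * aftercut how much of the tail does; with the fragment-crop policy the
 * duplicate bytes are trimmed from the mbuf, with fragment-drop ('drop'
 * set) an overlapping fragment is discarded outright.  Adjacent or
 * overlapping cache entries are merged so the list stays short.
 */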
struct mbuf *
pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
    int drop, int *nomem)
{
	struct mbuf		*m = *m0;
	struct pf_frcache	*frp, *fra, *cur = NULL;
	int			 ip_len = ntohs(h->ip_len) - (h->ip_hl << 2);
	u_int16_t		 off = ntohs(h->ip_off) << 3;
	u_int16_t		 max = ip_len + off;
	int			 hosed = 0;

	KASSERT(*frag == NULL || !BUFFER_FRAGMENTS(*frag));

	/* Create a new range queue for this packet */
	if (*frag == NULL) {
		*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
		if (*frag == NULL) {
			pf_flush_fragments();
			*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
			if (*frag == NULL)
				goto no_mem;
		}

		/* Get an entry for the queue */
		cur = pool_get(&pf_cent_pl, PR_NOWAIT);
		if (cur == NULL) {
			pool_put(&pf_cache_pl, *frag);
			*frag = NULL;
			goto no_mem;
		}
		pf_ncache++;

		(*frag)->fr_flags = PFFRAG_NOBUFFER;
		(*frag)->fr_max = 0;
		(*frag)->fr_src = h->ip_src;
		(*frag)->fr_dst = h->ip_dst;
		(*frag)->fr_p = h->ip_p;
		(*frag)->fr_id = h->ip_id;
		(*frag)->fr_timeout = time_second;

		cur->fr_off = off;
		cur->fr_end = max;
		LIST_INIT(&(*frag)->fr_cache);
		LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next);

		RB_INSERT(pf_frag_tree, &pf_cache_tree, *frag);
		TAILQ_INSERT_HEAD(&pf_cachequeue, *frag, frag_next);

		DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max));

		goto pass;
	}

	/*
	 * Find a fragment after the current one:
	 *  - off contains the real shifted offset.
	 */
	frp = NULL;
	LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) {
		if (fra->fr_off > off)
			break;
		frp = fra;
	}

	KASSERT(frp != NULL || fra != NULL);

	if (frp != NULL) {
		int	precut;

		precut = frp->fr_end - off;
		if (precut >= ip_len) {
			/* Fragment is entirely a duplicate */
			DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
			    h->ip_id, frp->fr_off, frp->fr_end, off, max));
			goto drop_fragment;
		}
		if (precut == 0) {
			/* They are adjacent.  Fixup cache entry */
			DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
			    h->ip_id, frp->fr_off, frp->fr_end, off, max));
			frp->fr_end = max;
		} else if (precut > 0) {
			/* The first part of this payload overlaps with a
			 * fragment that has already been passed.
			 * Need to trim off the first part of the payload.
			 * But to do so easily, we need to create another
			 * mbuf to throw the original header into.
			 */

			DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
			    h->ip_id, precut, frp->fr_off, frp->fr_end, off,
			    max));

			off += precut;
			max -= precut;
			/* Update the previous frag to encompass this one */
			frp->fr_end = max;

			if (!drop) {
				/* XXX Optimization opportunity
				 * This is a very heavy way to trim the payload.
				 * we could do it much faster by diddling mbuf
				 * internals but that would be even less legible
				 * than this mbuf magic.  For my next trick,
				 * I'll pull a rabbit out of my laptop.
				 */
				*m0 = m_dup(m, 0, h->ip_hl << 2, M_NOWAIT);
				if (*m0 == NULL)
					goto no_mem;
				KASSERT((*m0)->m_next == NULL);
				m_adj(m, precut + (h->ip_hl << 2));
				m_cat(*m0, m);
				m = *m0;
				if (m->m_flags & M_PKTHDR) {
					int plen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next)
						plen += t->m_len;
					m->m_pkthdr.len = plen;
				}

				h = mtod(m, struct ip *);

				KASSERT((int)m->m_len ==
				    ntohs(h->ip_len) - precut);
				h->ip_off = htons(ntohs(h->ip_off) +
				    (precut >> 3));
				h->ip_len = htons(ntohs(h->ip_len) - precut);
			} else {
				hosed++;
			}
		} else {
			/* There is a gap between fragments */

			DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
			    h->ip_id, -precut, frp->fr_off, frp->fr_end, off,
			    max));

			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
			if (cur == NULL)
				goto no_mem;
			pf_ncache++;

			cur->fr_off = off;
			cur->fr_end = max;
			LIST_INSERT_AFTER(frp, cur, fr_next);
		}
	}

	if (fra != NULL) {
		int	aftercut;
		int	merge = 0;

		aftercut = max - fra->fr_off;
		if (aftercut == 0) {
			/* Adjacent fragments */
			DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
			    h->ip_id, off, max, fra->fr_off, fra->fr_end));
			fra->fr_off = off;
			merge = 1;
		} else if (aftercut > 0) {
			/* Need to chop off the tail of this fragment */
			DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
			    h->ip_id, aftercut, off, max, fra->fr_off,
			    fra->fr_end));
			fra->fr_off = off;
			max -= aftercut;

			merge = 1;

			if (!drop) {
				m_adj(m, -aftercut);
				if (m->m_flags & M_PKTHDR) {
					int plen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next)
						plen += t->m_len;
					m->m_pkthdr.len = plen;
				}
				h = mtod(m, struct ip *);
				KASSERT((int)m->m_len ==
				    ntohs(h->ip_len) - aftercut);
				h->ip_len = htons(ntohs(h->ip_len) - aftercut);
			} else {
				hosed++;
			}
		} else if (frp == NULL) {
			/* There is a gap between fragments */
			DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
			    h->ip_id, -aftercut, off, max, fra->fr_off,
			    fra->fr_end));

			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
			if (cur == NULL)
				goto no_mem;
			pf_ncache++;

			cur->fr_off = off;
			cur->fr_end = max;
			LIST_INSERT_BEFORE(fra, cur, fr_next);
		}

		/* Need to glue together two separate fragment descriptors */
		if (merge) {
			if (cur && fra->fr_off <= cur->fr_end) {
				/* Need to merge in a previous 'cur' */
				DPFPRINTF(("fragcache[%d]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    h->ip_id, cur->fr_off, cur->fr_end, off,
				    max, fra->fr_off, fra->fr_end));
				fra->fr_off = cur->fr_off;
				LIST_REMOVE(cur, fr_next);
				pool_put(&pf_cent_pl, cur);
				cur = NULL;
				pf_ncache--;
			} else if (frp && fra->fr_off <= frp->fr_end) {
				/* Need to merge in a modified 'frp' */
				KASSERT(cur == NULL);
				DPFPRINTF(("fragcache[%d]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    h->ip_id, frp->fr_off, frp->fr_end, off,
				    max, fra->fr_off, fra->fr_end));
				fra->fr_off = frp->fr_off;
				LIST_REMOVE(frp, fr_next);
				pool_put(&pf_cent_pl, frp);
				frp = NULL;
				pf_ncache--;
			}
		}
	}

	if (hosed) {
		/*
		 * We must keep tracking the overall fragment even when
		 * we're going to drop it anyway so that we know when to
		 * free the overall descriptor.  Thus we drop the frag late.
		 */
		goto drop_fragment;
	}

 pass:
	/* Update maximum data size */
	if ((*frag)->fr_max < max)
		(*frag)->fr_max = max;

	/* This is the last segment */
	if (!mff)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	/* Check if we are completely reassembled */
	if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
	    LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 &&
	    LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) {
		/* Remove from fragment queue */
		DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
		    (*frag)->fr_max));
		pf_free_fragment(*frag);
		*frag = NULL;
	}

	return (m);

 no_mem:
	*nomem = 1;

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	m_freem(m);
	return (NULL);

 drop_fragment:

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	if (drop) {
		/* This fragment has been deemed bad.  Don't reass */
		if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
			DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
			    h->ip_id));
		(*frag)->fr_flags |= PFFRAG_DROP;
	}

	m_freem(m);
	return (NULL);
}
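
/*
 * Per-packet IPv4 entry point: match the scrub ruleset, sanity-check the
 * header, then hand fragments to pf_reassemble() (default) or
 * pf_fragcache() (fragment crop/drop).  Every header rewrite below goes
 * through pf_cksum_fixup() so ip_sum stays valid incrementally.
 */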
int
pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
    struct pf_pdesc *pd)
{
	struct mbuf		*m = *m0;
	struct pf_rule		*r;
	struct pf_frent		*frent;
	struct pf_fragment	*frag = NULL;
	struct ip		*h = mtod(m, struct ip *);
	int			 mff = (ntohs(h->ip_off) & IP_MF);
	int			 hlen = h->ip_hl << 2;
	u_int16_t		 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
	u_int16_t		 max;
	int			 ip_len;
	int			 ip_off;

	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	while (r != NULL) {
		r->evaluations++;
		if (pfi_kif_match(r->kif, kif) == r->ifnot)
			r = r->skip[PF_SKIP_IFP].ptr;
		else if (r->direction && r->direction != dir)
			r = r->skip[PF_SKIP_DIR].ptr;
		else if (r->af && r->af != AF_INET)
			r = r->skip[PF_SKIP_AF].ptr;
		else if (r->proto && r->proto != h->ip_p)
			r = r->skip[PF_SKIP_PROTO].ptr;
		else if (PF_MISMATCHAW(&r->src.addr,
		    (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
		    r->src.neg, kif))
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
		else if (PF_MISMATCHAW(&r->dst.addr,
		    (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
		    r->dst.neg, NULL))
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
		else
			break;
	}

	if (r == NULL || r->action == PF_NOSCRUB)
		return (PF_PASS);
	else {
		r->packets[dir == PF_OUT]++;
		r->bytes[dir == PF_OUT] += pd->tot_len;
	}

	/* Check for illegal packets */
	if (hlen < (int)sizeof(struct ip))
		goto drop;

	if (hlen > ntohs(h->ip_len))
		goto drop;

	/* Clear IP_DF if the rule uses the no-df option */
	if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
		u_int16_t ip_off = h->ip_off;

		h->ip_off &= htons(~IP_DF);
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
	}

	/* We will need other tests here */
	if (!fragoff && !mff)
		goto no_fragment;

	/* We're dealing with a fragment now. Don't allow fragments
	 * with IP_DF to enter the cache. If the flag was cleared by
	 * no-df above, fine. Otherwise drop it.
	 */
	if (h->ip_off & htons(IP_DF)) {
		DPFPRINTF(("IP_DF\n"));
		goto bad;
	}

	ip_len = ntohs(h->ip_len) - hlen;
	ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3;

	/* All fragments are 8 byte aligned */
	if (mff && (ip_len & 0x7)) {
		DPFPRINTF(("mff and %d\n", ip_len));
		goto bad;
	}

	/* Respect maximum length */
	if (fragoff + ip_len > IP_MAXPACKET) {
		DPFPRINTF(("max packet %d\n", fragoff + ip_len));
		goto bad;
	}
	max = fragoff + ip_len;

	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
		/* Fully buffer all of the fragments */

		frag = pf_find_fragment(h, &pf_frag_tree);

		/* Check if we saw the last fragment already */
		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
		    max > frag->fr_max)
			goto bad;

		/* Get an entry for the fragment queue */
		frent = pool_get(&pf_frent_pl, PR_NOWAIT);
		if (frent == NULL) {
			REASON_SET(reason, PFRES_MEMORY);
			return (PF_DROP);
		}
		pf_nfrents++;
		frent->fr_ip = h;
		frent->fr_m = m;

		/* Might return a completely reassembled mbuf, or NULL */
		DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max));
		*m0 = m = pf_reassemble(m0, &frag, frent, mff);

		if (m == NULL)
			return (PF_DROP);

		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
			goto drop;

		h = mtod(m, struct ip *);
	} else {
		/* non-buffering fragment cache (drops or masks overlaps) */
		int	nomem = 0;

#ifdef __NetBSD__
		struct pf_mtag *pf_mtag = pf_find_mtag(m);
		KASSERT(pf_mtag != NULL);

		if (dir == PF_OUT && pf_mtag->flags & PF_TAG_FRAGCACHE) {
#else
		if (dir == PF_OUT && m->m_pkthdr.pf.flags & PF_TAG_FRAGCACHE) {
#endif /* !__NetBSD__ */
			/*
			 * Already passed the fragment cache in the
			 * input direction.  If we continued, it would
			 * appear to be a dup and would be dropped.
			 */
			goto fragment_pass;
		}

		frag = pf_find_fragment(h, &pf_cache_tree);

		/* Check if we saw the last fragment already */
		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
		    max > frag->fr_max) {
			if (r->rule_flag & PFRULE_FRAGDROP)
				frag->fr_flags |= PFFRAG_DROP;
			goto bad;
		}

		*m0 = m = pf_fragcache(m0, h, &frag, mff,
		    (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
		if (m == NULL) {
			if (nomem)
				goto no_mem;
			goto drop;
		}

		if (dir == PF_IN) {
#ifdef __NetBSD__
			pf_mtag = pf_find_mtag(m);
			KASSERT(pf_mtag != NULL);

			pf_mtag->flags |= PF_TAG_FRAGCACHE;
#else
			m->m_pkthdr.pf.flags |= PF_TAG_FRAGCACHE;
#endif /* !__NetBSD__ */
		}

		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
			goto drop;
		goto fragment_pass;
	}

 no_fragment:
	/* At this point, only IP_DF is allowed in ip_off */
	if (h->ip_off & ~htons(IP_DF)) {
		u_int16_t ip_off = h->ip_off;

		h->ip_off &= htons(IP_DF);
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
	}

	/* Enforce a minimum ttl, may cause endless packet loops */
	if (r->min_ttl && h->ip_ttl < r->min_ttl) {
		u_int16_t ip_ttl = h->ip_ttl;

		h->ip_ttl = r->min_ttl;
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
	}

	if (r->rule_flag & PFRULE_RANDOMID) {
		u_int16_t ip_id = h->ip_id;

		h->ip_id = ip_randomid(0);
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0);
	}
	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
		pd->flags |= PFDESC_IP_REAS;

	return (PF_PASS);

 fragment_pass:
	/* Enforce a minimum ttl, may cause endless packet loops */
	if (r->min_ttl && h->ip_ttl < r->min_ttl) {
		u_int16_t ip_ttl = h->ip_ttl;

		h->ip_ttl = r->min_ttl;
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
	}
	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
		pd->flags |= PFDESC_IP_REAS;
	return (PF_PASS);

 no_mem:
	REASON_SET(reason, PFRES_MEMORY);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
	return (PF_DROP);

 drop:
	REASON_SET(reason, PFRES_NORM);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
	return (PF_DROP);

 bad:
	DPFPRINTF(("dropping bad fragment\n"));

	/* Free associated fragments */
	if (frag != NULL)
		pf_free_fragment(frag);

	REASON_SET(reason, PFRES_FRAG);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);

	return (PF_DROP);
}
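
/*
 * The IPv6 variant cannot reassemble yet; it walks the extension header
 * chain to find a fragment header, validates jumbograms (a jumbo payload
 * option is only legal when ip6_plen is 0 and must match the mbuf
 * length), and enforces min-ttl on ip6_hlim.
 */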
#ifdef INET6
int
pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
    u_short *reason, struct pf_pdesc *pd)
{
	struct mbuf		*m = *m0;
	struct pf_rule		*r;
	struct ip6_hdr		*h = mtod(m, struct ip6_hdr *);
	int			 off;
	struct ip6_ext		 ext;
	struct ip6_opt		 opt;
	struct ip6_opt_jumbo	 jumbo;
	struct ip6_frag		 frag;
	u_int32_t		 jumbolen = 0, plen;
	u_int16_t		 fragoff = 0;
	int			 optend;
	int			 ooff;
	u_int8_t		 proto;
	int			 terminal;

	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	while (r != NULL) {
		r->evaluations++;
		if (pfi_kif_match(r->kif, kif) == r->ifnot)
			r = r->skip[PF_SKIP_IFP].ptr;
		else if (r->direction && r->direction != dir)
			r = r->skip[PF_SKIP_DIR].ptr;
		else if (r->af && r->af != AF_INET6)
			r = r->skip[PF_SKIP_AF].ptr;
#if 0 /* header chain! */
		else if (r->proto && r->proto != h->ip6_nxt)
			r = r->skip[PF_SKIP_PROTO].ptr;
#endif
		else if (PF_MISMATCHAW(&r->src.addr,
		    (struct pf_addr *)&h->ip6_src, AF_INET6,
		    r->src.neg, kif))
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
		else if (PF_MISMATCHAW(&r->dst.addr,
		    (struct pf_addr *)&h->ip6_dst, AF_INET6,
		    r->dst.neg, NULL))
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
		else
			break;
	}

	if (r == NULL || r->action == PF_NOSCRUB)
		return (PF_PASS);
	else {
		r->packets[dir == PF_OUT]++;
		r->bytes[dir == PF_OUT] += pd->tot_len;
	}

	/* Check for illegal packets */
	if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len)
		goto drop;

	off = sizeof(struct ip6_hdr);
	proto = h->ip6_nxt;
	terminal = 0;
	do {
		switch (proto) {
		case IPPROTO_FRAGMENT:
			goto fragment;
			break;
		case IPPROTO_AH:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
			    NULL, AF_INET6))
				goto shortpkt;
			if (proto == IPPROTO_AH)
				off += (ext.ip6e_len + 2) * 4;
			else
				off += (ext.ip6e_len + 1) * 8;
			proto = ext.ip6e_nxt;
			break;
		case IPPROTO_HOPOPTS:
			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
			    NULL, AF_INET6))
				goto shortpkt;
			optend = off + (ext.ip6e_len + 1) * 8;
			ooff = off + sizeof(ext);
			do {
				if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
				    sizeof(opt.ip6o_type), NULL, NULL,
				    AF_INET6))
					goto shortpkt;
				if (opt.ip6o_type == IP6OPT_PAD1) {
					ooff++;
					continue;
				}
				if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt),
				    NULL, NULL, AF_INET6))
					goto shortpkt;
				if (ooff + sizeof(opt) + opt.ip6o_len > optend)
					goto drop;
				switch (opt.ip6o_type) {
				case IP6OPT_JUMBO:
					if (h->ip6_plen != 0)
						goto drop;
					if (!pf_pull_hdr(m, ooff, &jumbo,
					    sizeof(jumbo), NULL, NULL,
					    AF_INET6))
						goto shortpkt;
					memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
					    sizeof(jumbolen));
					jumbolen = ntohl(jumbolen);
					if (jumbolen <= IPV6_MAXPACKET)
						goto drop;
					if (sizeof(struct ip6_hdr) + jumbolen !=
					    m->m_pkthdr.len)
						goto drop;
					break;
				default:
					break;
				}
				ooff += sizeof(opt) + opt.ip6o_len;
			} while (ooff < optend);

			off = optend;
			proto = ext.ip6e_nxt;
			break;
		default:
			terminal = 1;
			break;
		}
	} while (!terminal);

	/* jumbo payload option must be present, or plen > 0 */
	if (ntohs(h->ip6_plen) == 0)
		plen = jumbolen;
	else
		plen = ntohs(h->ip6_plen);
	if (plen == 0)
		goto drop;
	if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
		goto shortpkt;

	/* Enforce a minimum ttl, may cause endless packet loops */
	if (r->min_ttl && h->ip6_hlim < r->min_ttl)
		h->ip6_hlim = r->min_ttl;

	return (PF_PASS);

 fragment:
	if (ntohs(h->ip6_plen) == 0 || jumbolen)
		goto drop;
	plen = ntohs(h->ip6_plen);

	if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6))
		goto shortpkt;
	fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK);
	if (fragoff + (plen - off - sizeof(frag)) > IPV6_MAXPACKET)
		goto badfrag;

	/* do something about it */
	/* remember to set pd->flags |= PFDESC_IP_REAS */
	return (PF_PASS);

 shortpkt:
	REASON_SET(reason, PFRES_SHORT);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
	return (PF_DROP);

 drop:
	REASON_SET(reason, PFRES_NORM);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
	return (PF_DROP);

 badfrag:
	REASON_SET(reason, PFRES_FRAG);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
	return (PF_DROP);
}
#endif /* INET6 */
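
/*
 * Stateless TCP sanity checks: SYN+RST is never valid, SYN+FIN is
 * downgraded to plain SYN, FIN/PUSH/URG require ACK, and the reserved
 * th_x2 bits are cleared; an unused urgent pointer is zeroed.  Any edit
 * is folded into th_sum and the header is copied back only when
 * something actually changed.
 */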
int
pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m,
    int ipoff, int off, void *h, struct pf_pdesc *pd)
{
	struct pf_rule	*r, *rm = NULL;
	struct tcphdr	*th = pd->hdr.tcp;
	int		 rewrite = 0;
	u_short		 reason;
	u_int8_t	 flags;
	sa_family_t	 af = pd->af;

	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	while (r != NULL) {
		r->evaluations++;
		if (pfi_kif_match(r->kif, kif) == r->ifnot)
			r = r->skip[PF_SKIP_IFP].ptr;
		else if (r->direction && r->direction != dir)
			r = r->skip[PF_SKIP_DIR].ptr;
		else if (r->af && r->af != af)
			r = r->skip[PF_SKIP_AF].ptr;
		else if (r->proto && r->proto != pd->proto)
			r = r->skip[PF_SKIP_PROTO].ptr;
		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
		    r->src.neg, kif))
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
		else if (r->src.port_op && !pf_match_port(r->src.port_op,
			    r->src.port[0], r->src.port[1], th->th_sport))
			r = r->skip[PF_SKIP_SRC_PORT].ptr;
		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
		    r->dst.neg, NULL))
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
			    r->dst.port[0], r->dst.port[1], th->th_dport))
			r = r->skip[PF_SKIP_DST_PORT].ptr;
		else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
			    pf_osfp_fingerprint(pd, m, off, th),
			    r->os_fingerprint))
			r = TAILQ_NEXT(r, entries);
		else {
			rm = r;
			break;
		}
	}

	if (rm == NULL || rm->action == PF_NOSCRUB)
		return (PF_PASS);
	else {
		r->packets[dir == PF_OUT]++;
		r->bytes[dir == PF_OUT] += pd->tot_len;
	}

	if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
		pd->flags |= PFDESC_TCP_NORM;

	flags = th->th_flags;
	if (flags & TH_SYN) {
		/* Illegal packet */
		if (flags & TH_RST)
			goto tcp_drop;

		if (flags & TH_FIN)
			flags &= ~TH_FIN;
	} else {
		/* Illegal packet */
		if (!(flags & (TH_ACK|TH_RST)))
			goto tcp_drop;
	}

	if (!(flags & TH_ACK)) {
		/* These flags are only valid if ACK is set */
		if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
			goto tcp_drop;
	}

	/* Check for illegal header length */
	if (th->th_off < (sizeof(struct tcphdr) >> 2))
		goto tcp_drop;

	/* If flags changed, or reserved data set, then adjust */
	if (flags != th->th_flags || th->th_x2 != 0) {
		u_int16_t	ov, nv;

		ov = *(u_int16_t *)(&th->th_ack + 1);
		th->th_flags = flags;
		th->th_x2 = 0;
		nv = *(u_int16_t *)(&th->th_ack + 1);

		th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0);
		rewrite = 1;
	}

	/* Remove urgent pointer, if TH_URG is not set */
	if (!(flags & TH_URG) && th->th_urp) {
		th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0);
		th->th_urp = 0;
		rewrite = 1;
	}

	/* Process options */
	if (r->max_mss && pf_normalize_tcpopt(r, m, th, off))
		rewrite = 1;

	/* copy back packet headers if we sanitized */
	if (rewrite)
		m_copyback(m, off, sizeof(*th), th);

	return (PF_PASS);

 tcp_drop:
	REASON_SET(&reason, PFRES_NORM);
	if (rm != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, r, NULL, NULL, pd);
	return (PF_DROP);
}
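
/*
 * Runs once per connection (on the initial SYN): records the peer's TTL
 * and, if a timestamp option is present, seeds pfss_ts_mod with a random
 * modulator and captures tsval/tsecr as the PAWS baseline.
 */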
int
pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd,
    struct tcphdr *th, struct pf_state_peer *src,
    struct pf_state_peer *dst)
{
	u_int32_t tsval, tsecr;
	u_int8_t hdr[60];
	u_int8_t *opt;

	KASSERT(src->scrub == NULL);

	src->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT);
	if (src->scrub == NULL)
		return (1);
	bzero(src->scrub, sizeof(*src->scrub));

	switch (pd->af) {
#ifdef INET
	case AF_INET: {
		struct ip *h = mtod(m, struct ip *);
		src->scrub->pfss_ttl = h->ip_ttl;
		break;
	}
#endif /* INET */
#ifdef INET6
	case AF_INET6: {
		struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
		src->scrub->pfss_ttl = h->ip6_hlim;
		break;
	}
#endif /* INET6 */
	}

	/*
	 * All normalizations below are only begun if we see the start of
	 * the connections.  They must all set an enabled bit in pfss_flags
	 */
	if ((th->th_flags & TH_SYN) == 0)
		return (0);

	if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
		/* Diddle with TCP options */
		int hlen;
		opt = hdr + sizeof(struct tcphdr);
		hlen = (th->th_off << 2) - sizeof(struct tcphdr);
		while (hlen >= TCPOLEN_TIMESTAMP) {
			switch (*opt) {
			case TCPOPT_EOL:	/* FALLTHROUGH */
			case TCPOPT_NOP:
				opt++;
				hlen--;
				break;
			case TCPOPT_TIMESTAMP:
				if (opt[1] >= TCPOLEN_TIMESTAMP) {
					src->scrub->pfss_flags |=
					    PFSS_TIMESTAMP;
					src->scrub->pfss_ts_mod =
					    htonl(arc4random());

					/* note PFSS_PAWS not set yet */
					memcpy(&tsval, &opt[2],
					    sizeof(u_int32_t));
					memcpy(&tsecr, &opt[6],
					    sizeof(u_int32_t));
					src->scrub->pfss_tsval0 = ntohl(tsval);
					src->scrub->pfss_tsval = ntohl(tsval);
					src->scrub->pfss_tsecr = ntohl(tsecr);
					getmicrouptime(&src->scrub->pfss_last);
				}
				/* FALLTHROUGH */
			default:
				hlen -= MAX(opt[1], 2);
				opt += MAX(opt[1], 2);
				break;
			}
		}
	}

	return (0);
}
void
pf_normalize_tcp_cleanup(struct pf_state *state)
{
	if (state->src.scrub)
		pool_put(&pf_state_scrub_pl, state->src.scrub);
	if (state->dst.scrub)
		pool_put(&pf_state_scrub_pl, state->dst.scrub);

	/* Someday... flush the TCP segment reassembly descriptors. */
}
int
pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
    u_short *reason, struct tcphdr *th, struct pf_state *state,
    struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
{
	struct timeval	 uptime;
	u_int32_t	 tsval, tsecr;
	u_int		 tsval_from_last;
	u_int8_t	 hdr[60];
	u_int8_t	*opt;
	int		 copyback = 0;
	int		 got_ts = 0;

	KASSERT(src->scrub || dst->scrub);

	/*
	 * Enforce the minimum TTL seen for this connection.  Negate a common
	 * technique to evade an intrusion detection system and confuse
	 * firewall state code.
	 */
	switch (pd->af) {
#ifdef INET
	case AF_INET: {
		if (src->scrub) {
			struct ip *h = mtod(m, struct ip *);
			if (h->ip_ttl > src->scrub->pfss_ttl)
				src->scrub->pfss_ttl = h->ip_ttl;
			h->ip_ttl = src->scrub->pfss_ttl;
		}
		break;
	}
#endif /* INET */
#ifdef INET6
	case AF_INET6: {
		if (src->scrub) {
			struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
			if (h->ip6_hlim > src->scrub->pfss_ttl)
				src->scrub->pfss_ttl = h->ip6_hlim;
			h->ip6_hlim = src->scrub->pfss_ttl;
		}
		break;
	}
#endif /* INET6 */
	}

	if (th->th_off > (sizeof(struct tcphdr) >> 2) &&
	    ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
	    (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
		/* Diddle with TCP options */
		int hlen;
		opt = hdr + sizeof(struct tcphdr);
		hlen = (th->th_off << 2) - sizeof(struct tcphdr);
		while (hlen >= TCPOLEN_TIMESTAMP) {
			switch (*opt) {
			case TCPOPT_EOL:	/* FALLTHROUGH */
			case TCPOPT_NOP:
				opt++;
				hlen--;
				break;
			case TCPOPT_TIMESTAMP:
				/* Modulate the timestamps.  Can be used for
				 * NAT detection, OS uptime determination or
				 * reboot detection.
				 */

				if (got_ts) {
					/* Huh?  Multiple timestamps!? */
					if (pf_status.debug >= PF_DEBUG_MISC) {
						DPFPRINTF(("multiple TS??"));
						pf_print_state(state);
						printf("\n");
					}
					REASON_SET(reason, PFRES_TS);
					return (PF_DROP);
				}
				if (opt[1] >= TCPOLEN_TIMESTAMP) {
					memcpy(&tsval, &opt[2],
					    sizeof(u_int32_t));
					if (tsval && src->scrub &&
					    (src->scrub->pfss_flags &
					    PFSS_TIMESTAMP)) {
						tsval = ntohl(tsval);
						pf_change_a(&opt[2],
						    &th->th_sum,
						    htonl(tsval +
						    src->scrub->pfss_ts_mod),
						    0);
						copyback = 1;
					}

					/* Modulate TS reply iff valid (!0) */
					memcpy(&tsecr, &opt[6],
					    sizeof(u_int32_t));
					if (tsecr && dst->scrub &&
					    (dst->scrub->pfss_flags &
					    PFSS_TIMESTAMP)) {
						tsecr = ntohl(tsecr)
						    - dst->scrub->pfss_ts_mod;
						pf_change_a(&opt[6],
						    &th->th_sum, htonl(tsecr),
						    0);
						copyback = 1;
					}
					got_ts = 1;
				}
				/* FALLTHROUGH */
			default:
				hlen -= MAX(opt[1], 2);
				opt += MAX(opt[1], 2);
				break;
			}
		}
		if (copyback) {
			/* Copyback the options, caller copys back header */
			*writeback = 1;
			m_copyback(m, off + sizeof(struct tcphdr),
			    (th->th_off << 2) - sizeof(struct tcphdr), hdr +
			    sizeof(struct tcphdr));
		}
	}

	/*
	 * Must invalidate PAWS checks on connections idle for too long.
	 * The fastest allowed timestamp clock is 1ms.  That turns out to
	 * be about 24 days before it wraps.  XXX Right now our lowerbound
	 * TS echo check only works for the first 12 days of a connection
	 * when the TS has exhausted half its 32bit space
	 */
#define TS_MAX_IDLE	(24*24*60*60)
#define TS_MAX_CONN	(12*24*60*60)	/* XXX remove when better tsecr check */

	getmicrouptime(&uptime);
	if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
	    (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
	    time_second - state->creation > TS_MAX_CONN)) {
		if (pf_status.debug >= PF_DEBUG_MISC) {
			DPFPRINTF(("src idled out of PAWS\n"));
			pf_print_state(state);
			printf("\n");
		}
		src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
		    | PFSS_PAWS_IDLED;
	}
	if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
	    uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
		if (pf_status.debug >= PF_DEBUG_MISC) {
			DPFPRINTF(("dst idled out of PAWS\n"));
			pf_print_state(state);
			printf("\n");
		}
		dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
		    | PFSS_PAWS_IDLED;
	}

	if (got_ts && src->scrub && dst->scrub &&
	    (src->scrub->pfss_flags & PFSS_PAWS) &&
	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
		/* Validate that the timestamps are "in-window".
		 * RFC1323 describes TCP Timestamp options that allow
		 * measurement of RTT (round trip time) and PAWS
		 * (protection against wrapped sequence numbers).  PAWS
		 * gives us a set of rules for rejecting packets on
		 * long fat pipes (packets that were somehow delayed
		 * in transit longer than the time it took to send the
		 * full TCP sequence space of 4Gb).  We can use these
		 * rules and infer a few others that will let us treat
		 * the 32bit timestamp and the 32bit echoed timestamp
		 * as sequence numbers to prevent a blind attacker from
		 * inserting packets into a connection.
		 *
		 * RFC1323 tells us:
		 *  - The timestamp on this packet must be greater than
		 *    or equal to the last value echoed by the other
		 *    endpoint.  The RFC says those will be discarded
		 *    since it is a dup that has already been acked.
		 *    This gives us a lowerbound on the timestamp.
		 *        timestamp >= other last echoed timestamp
		 *  - The timestamp will be less than or equal to
		 *    the last timestamp plus the time between the
		 *    last packet and now.  The RFC defines the max
		 *    clock rate as 1ms.  We will allow clocks to be
		 *    up to 10% fast and will allow a total difference
		 *    or 30 seconds due to a route change.  And this
		 *    gives us an upperbound on the timestamp.
		 *        timestamp <= last timestamp + max ticks
		 *    We have to be careful here.  Windows will send an
		 *    initial timestamp of zero and then initialize it
		 *    to a random value after the 3whs; presumably to
		 *    avoid a DoS by having to call an expensive RNG
		 *    during a SYN flood.  Proof MS has at least one
		 *    good security geek.
		 *
		 *  - The TCP timestamp option must also echo the other
		 *    endpoints timestamp.  The timestamp echoed is the
		 *    one carried on the earliest unacknowledged segment
		 *    on the left edge of the sequence window.  The RFC
		 *    states that the host will reject any echoed
		 *    timestamps that were larger than any ever sent.
		 *    This gives us an upperbound on the TS echo.
		 *        tescr <= largest_tsval
		 *  - The lowerbound on the TS echo is a little more
		 *    tricky to determine.  The other endpoint's echoed
		 *    values will not decrease.  But there may be
		 *    network conditions that re-order packets and
		 *    cause our view of them to decrease.  For now the
		 *    only lowerbound we can safely determine is that
		 *    the TS echo will never be less than the original
		 *    TS.  XXX There is probably a better lowerbound.
		 *    Remove TS_MAX_CONN with better lowerbound check.
		 *        tescr >= other original TS
		 *
		 * It is also important to note that the fastest
		 * timestamp clock of 1ms will wrap its 32bit space in
		 * 24 days.  So we just disable TS checking after 24
		 * days of idle time.  We actually must use a 12d
		 * connection limit until we can come up with a better
		 * lowerbound to the TS echo check.
		 */
		struct timeval	delta_ts;
		int		ts_fudge;

		/*
		 * PFTM_TS_DIFF is how many seconds of leeway to allow
		 * a host's timestamp.  This can happen if the previous
		 * packet got delayed in transit for much longer than
		 * this packet.
		 */
		if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
			ts_fudge = pf_default_rule.timeout[PFTM_TS_DIFF];

		/* Calculate max ticks since the last timestamp */
#define TS_MAXFREQ	1100		/* RFC max TS freq of 1Khz + 10% skew */
#define TS_MICROSECS	1000000		/* microseconds per second */
		timersub(&uptime, &src->scrub->pfss_last, &delta_ts);
		tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
		tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);

		if ((src->state >= TCPS_ESTABLISHED &&
		    dst->state >= TCPS_ESTABLISHED) &&
		    (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
		    SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
		    (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
		    SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
			/* Bad RFC1323 implementation or an insertion attack.
			 *
			 * - Solaris 2.6 and 2.7 are known to send another ACK
			 *   after the FIN,FIN|ACK,ACK closing that carries
			 *   an old timestamp.
			 */

			DPFPRINTF(("Timestamp failed %c%c%c%c\n",
			    SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
			    SEQ_GT(tsval, src->scrub->pfss_tsval +
			    tsval_from_last) ? '1' : ' ',
			    SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
			    SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
			DPFPRINTF((" tsval: %" PRIu32 "  tsecr: %" PRIu32
			    "  +ticks: %" PRIu32 "  idle: %"PRIx64"s %ums\n",
			    tsval, tsecr, tsval_from_last, delta_ts.tv_sec,
			    delta_ts.tv_usec / 1000U));
			DPFPRINTF((" src->tsval: %" PRIu32 "  tsecr: %" PRIu32
			    "\n",
			    src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
			DPFPRINTF((" dst->tsval: %" PRIu32 "  tsecr: %" PRIu32
			    "  tsval0: %" PRIu32 "\n",
			    dst->scrub->pfss_tsval,
			    dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
			if (pf_status.debug >= PF_DEBUG_MISC) {
				pf_print_state(state);
				pf_print_flags(th->th_flags);
				printf("\n");
			}
			REASON_SET(reason, PFRES_TS);
			return (PF_DROP);
		}

		/* XXX I'd really like to require tsecr but it's optional */

	} else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
	    ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
	    || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
	    src->scrub && dst->scrub &&
	    (src->scrub->pfss_flags & PFSS_PAWS) &&
	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
		/* Didn't send a timestamp.  Timestamps aren't really useful
		 * when:
		 *  - connection opening or closing (often not even sent).
		 *    but we must not let an attacker to put a FIN on a
		 *    data packet to sneak it through our ESTABLISHED check.
		 *  - on a TCP reset.  RFC suggests not even looking at TS.
		 *  - on an empty ACK.  The TS will not be echoed so it will
		 *    probably not help keep the RTT calculation in sync and
		 *    there isn't as much danger when the sequence numbers
		 *    got wrapped.  So some stacks don't include TS on empty
		 *    ACKs :-(
		 *
		 * To minimize the disruption to mostly RFC1323 conformant
		 * stacks, we will only require timestamps on data packets.
		 *
		 * And what do ya know, we cannot require timestamps on data
		 * packets.  There appear to be devices that do legitimate
		 * TCP connection hijacking.  There are HTTP devices that allow
		 * a 3whs (with timestamps) and then buffer the HTTP request.
		 * If the intermediate device has the HTTP response cache, it
		 * will spoof the response but not bother timestamping its
		 * packets.  So we can look for the presence of a timestamp in
		 * the first data packet and if there, require it in all future
		 * packets.
		 */

		if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
			/*
			 * Hey!  Someone tried to sneak a packet in.  Or the
			 * stack changed its RFC1323 behavior?!?!
			 */
			if (pf_status.debug >= PF_DEBUG_MISC) {
				DPFPRINTF(("Did not receive expected RFC1323 "
				    "timestamp\n"));
				pf_print_state(state);
				pf_print_flags(th->th_flags);
				printf("\n");
			}
			REASON_SET(reason, PFRES_TS);
			return (PF_DROP);
		}
	}

	/*
	 * We will note if a host sends his data packets with or without
	 * timestamps.  And require all data packets to contain a timestamp
	 * if the first does.  PAWS implicitly requires that all data packets be
	 * timestamped.  But I think there are middle-man devices that hijack
	 * TCP streams immediately after the 3whs and don't timestamp their
	 * packets (seen in a WWW accelerator or cache).
	 */
	if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
	    (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
		if (got_ts)
			src->scrub->pfss_flags |= PFSS_DATA_TS;
		else {
			src->scrub->pfss_flags |= PFSS_DATA_NOTS;
			if (pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
			    (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
				/* Don't warn if other host rejected RFC1323 */
				DPFPRINTF(("Broken RFC1323 stack did not "
				    "timestamp data packet. Disabled PAWS "
				    "security.\n"));
				pf_print_state(state);
				pf_print_flags(th->th_flags);
				printf("\n");
			}
		}
	}

	/*
	 * Update PAWS values
	 */
	if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
	    (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
		getmicrouptime(&src->scrub->pfss_last);
		if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
		    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
			src->scrub->pfss_tsval = tsval;

		if (tsecr) {
			if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
			    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
				src->scrub->pfss_tsecr = tsecr;

			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
			    (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
			    src->scrub->pfss_tsval0 == 0)) {
				/* tsval0 MUST be the lowest timestamp */
				src->scrub->pfss_tsval0 = tsval;
			}

			/* Only fully initialized after a TS gets echoed */
			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
				src->scrub->pfss_flags |= PFSS_PAWS;
		}
	}

	/* I have a dream....  TCP segment reassembly.... */
	return (0);
}
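/*
 * MSS clamping for "scrub ... max-mss": walks the TCP options in place
 * and lowers any advertised MSS above r->max_mss, e.g. max-mss 1440
 * rewrites a SYN's MSS of 1460 to 1440, fixing th_sum via
 * pf_cksum_fixup().  Returns nonzero when the header needs copyback.
 */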
int
pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th,
    int off)
{
	u_int16_t	*mss;
	int		 thoff;
	int		 opt, cnt, optlen = 0;
	int		 rewrite = 0;
	u_char		*optp;

	thoff = th->th_off << 2;
	cnt = thoff - sizeof(struct tcphdr);
	optp = mtod(m, u_char *) + off + sizeof(struct tcphdr);

	for (; cnt > 0; cnt -= optlen, optp += optlen) {
		opt = optp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = optp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {
		case TCPOPT_MAXSEG:
			mss = (u_int16_t *)(optp + 2);
			if ((ntohs(*mss)) > r->max_mss) {
				th->th_sum = pf_cksum_fixup(th->th_sum,
				    *mss, htons(r->max_mss), 0);
				*mss = htons(r->max_mss);
				rewrite = 1;
			}
			break;
		default:
			break;
		}
	}

	return (rewrite);
}