[PATCH] [RFC v2] packet: experimental support for 64-bit timestamps

From: Arnd Bergmann
Date: Tue Nov 28 2017 - 08:15:55 EST


This is a second attempt to allow 64-bit timestamps in packet sockets,
this time retaining the existing three API versions, and instead using
flags for the timestamping interface that let us reinterpret the existing
fields.

We already have an interface to select the timestamp source to be either
software or hardware timestamps, using the SOF_TIMESTAMPING_RAW_HARDWARE
flag in the PACKET_TIMESTAMP option that gets passed to setsockopt().

This patch adds another two flags for the same interface:
SOF_TIMESTAMPING_SKIP makes all timestamps zero, which can save a few
cycles per packet when we don't have to touch an expensive clocksource
hardware. When SOF_TIMESTAMPING_OPT_NS64 gets added to the flag, we
reinterpret the tp_sec/tp_nsec fields to be the upper and lower half of
the 64-bit nanosecond value. This lets us use the interface until year
2554, when 64-bit unsigned nanoseconds overflow.

The implementation is fairly straightforward, but I'm less sure about the
interface. Using SOF_TIMESTAMPING_* flags in PACKET_TIMESTAMP is a bit
odd already since most of the other flags make no sense here. Adding two
more flags that only make sense for packet sockets but not the normal
SO_TIMESTAMPING option on other sockets makes this even more confusing.

Link: https://patchwork.kernel.org/patch/10077199/
Suggested-by: Willem de Bruijn <willemdebruijn.kernel@xxxxxxxxx>
Signed-off-by: Arnd Bergmann <arnd@xxxxxxxx>
---
This is only a proof-of-concept patch, only compile-tested and put here
for discussion about whether we want it in principle.
---
include/uapi/linux/net_tstamp.h | 4 +-
net/packet/af_packet.c | 129 +++++++++++++++++++++++++---------------
2 files changed, 85 insertions(+), 48 deletions(-)

diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 4fe104b2411f..1b312d35fa41 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -30,8 +30,10 @@ enum {
SOF_TIMESTAMPING_OPT_STATS = (1<<12),
SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13),
SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14),
+ SOF_TIMESTAMPING_SKIP = (1<<15),
+ SOF_TIMESTAMPING_OPT_NS64 = (1<<16),

- SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TX_SWHW,
+ SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_NS64,
SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
SOF_TIMESTAMPING_LAST
};
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 7432c6699818..8e40040df967 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -200,7 +200,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
- struct tpacket_block_desc *);
+ struct tpacket_block_desc *, struct packet_sock *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
@@ -439,52 +439,92 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
}
}

-static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
+static __u32 tpacket_get_timestamp(struct sk_buff *skb, __u32 *hi, __u32 *lo,
unsigned int flags)
{
+ struct packet_sock *po = pkt_sk(skb->sk);
struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+ ktime_t stamp;
+ u32 type;
+
+ if (po->tp_tstamp & SOF_TIMESTAMPING_SKIP)
+ return 0;

if (shhwtstamps &&
- (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
- ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
- return TP_STATUS_TS_RAW_HARDWARE;
+ (po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+ shhwtstamps->hwtstamp) {
+ stamp = shhwtstamps->hwtstamp;
+ type = TP_STATUS_TS_RAW_HARDWARE;
+ } else if (skb->tstamp) {
+ stamp = skb->tstamp;
+ type = TP_STATUS_TS_SOFTWARE;
+ } else {
+ return 0;
+ }

- if (ktime_to_timespec64_cond(skb->tstamp, ts))
- return TP_STATUS_TS_SOFTWARE;
+ if (po->tp_tstamp & SOF_TIMESTAMPING_OPT_NS64) {
+ __u64 ns = ktime_to_ns(stamp);

- return 0;
+ *hi = upper_32_bits(ns);
+ *lo = lower_32_bits(ns);
+ } else {
+ struct timespec64 ts = ktime_to_timespec64(stamp);
+
+ *hi = ts.tv_sec;
+ if (po->tp_version == TPACKET_V1)
+ *lo = ts.tv_nsec / NSEC_PER_USEC;
+ else
+ *lo = ts.tv_nsec;
+ }
+
+ return type;
+}
+
+static void packet_get_time(struct packet_sock *po, __u32 *hi, __u32 *lo)
+{
+ if (po->tp_tstamp & SOF_TIMESTAMPING_SKIP) {
+ *hi = 0;
+ *lo = 0;
+ } else if (po->tp_tstamp & SOF_TIMESTAMPING_OPT_NS64) {
+ __u64 ns = ktime_get_real_ns();
+
+ *hi = upper_32_bits(ns);
+ *hi = lower_32_bits(ns);
+ } else {
+ struct timespec64 ts;
+
+ ktime_get_real_ts64(&ts);
+ /* unsigned seconds overflow in y2106 here */
+ *hi = ts.tv_sec;
+ if (po->tp_version == TPACKET_V1)
+ *lo = ts.tv_nsec / NSEC_PER_USEC;
+ else
+ *lo = ts.tv_nsec;
+ }
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
struct sk_buff *skb)
{
union tpacket_uhdr h;
- struct timespec64 ts;
- __u32 ts_status;
+ __u32 ts_status, hi, lo;

- if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+ if (!(ts_status = tpacket_get_timestamp(skb, &hi, &lo, po->tp_tstamp)))
return 0;

h.raw = frame;
- /*
- * versions 1 through 3 overflow the timestamps in y2106, since they
- * all store the seconds in a 32-bit unsigned integer.
- * If we create a version 4, that should have a 64-bit timestamp,
- * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
- * nanoseconds.
- */
switch (po->tp_version) {
case TPACKET_V1:
- h.h1->tp_sec = ts.tv_sec;
- h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
+ h.h1->tp_sec = hi;
+ h.h1->tp_usec = lo;
break;
case TPACKET_V2:
- h.h2->tp_sec = ts.tv_sec;
- h.h2->tp_nsec = ts.tv_nsec;
+ h.h2->tp_sec = hi;
+ h.h2->tp_nsec = lo;
break;
case TPACKET_V3:
- h.h3->tp_sec = ts.tv_sec;
- h.h3->tp_nsec = ts.tv_nsec;
+ h.h3->tp_sec = hi;
+ h.h3->tp_nsec = lo;
break;
default:
WARN(1, "TPACKET version not supported.\n");
@@ -633,7 +673,7 @@ static void init_prb_bdqc(struct packet_sock *po,
p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
prb_init_ft_ops(p1, req_u);
prb_setup_retire_blk_timer(po);
- prb_open_block(p1, pbd);
+ prb_open_block(p1, pbd, po);
}

/* Do NOT update the last_blk_num first.
@@ -730,7 +770,7 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
* opening a block thaws the queue,restarts timer
* Thawing/timer-refresh is a side effect.
*/
- prb_open_block(pkc, pbd);
+ prb_open_block(pkc, pbd, po);
goto out;
}
}
@@ -812,10 +852,8 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1,
* It shouldn't really happen as we don't close empty
* blocks. See prb_retire_rx_blk_timer_expired().
*/
- struct timespec64 ts;
- ktime_get_real_ts64(&ts);
- h1->ts_last_pkt.ts_sec = ts.tv_sec;
- h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
+ packet_get_time(po, &h1->ts_last_pkt.ts_sec,
+ &h1->ts_last_pkt.ts_nsec);
}

smp_wmb();
@@ -841,9 +879,8 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
*
*/
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
- struct tpacket_block_desc *pbd1)
+ struct tpacket_block_desc *pbd1, struct packet_sock *po)
{
- struct timespec64 ts;
struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

smp_rmb();
@@ -856,10 +893,8 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
BLOCK_NUM_PKTS(pbd1) = 0;
BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

- ktime_get_real_ts64(&ts);
-
- h1->ts_first_pkt.ts_sec = ts.tv_sec;
- h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
+ packet_get_time(po, &h1->ts_first_pkt.ts_sec,
+ &h1->ts_first_pkt.ts_nsec);

pkc1->pkblk_start = (char *)pbd1;
pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
@@ -936,7 +971,7 @@ static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
* open this block and return the offset where the first packet
* needs to get stored.
*/
- prb_open_block(pkc, pbd);
+ prb_open_block(pkc, pbd, po);
return (void *)pkc->nxt_offset;
}

@@ -1068,7 +1103,7 @@ static void *__packet_lookup_frame_in_block(struct packet_sock *po,
* opening a block also thaws the queue.
* Thawing is a side effect.
*/
- prb_open_block(pkc, pbd);
+ prb_open_block(pkc, pbd, po);
}
}

@@ -2191,8 +2226,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
unsigned long status = TP_STATUS_USER;
unsigned short macoff, netoff, hdrlen;
struct sk_buff *copy_skb = NULL;
- struct timespec64 ts;
__u32 ts_status;
+ __u32 hi, lo;
bool is_drop_n_account = false;
bool do_vnet = false;

@@ -2318,8 +2353,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,

skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

- if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
- ktime_get_real_ts64(&ts);
+ if (!(ts_status = tpacket_get_timestamp(skb, &hi, &lo, po->tp_tstamp)))
+ packet_get_time(po, &hi, &lo);

status |= ts_status;

@@ -2329,8 +2364,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
h.h1->tp_snaplen = snaplen;
h.h1->tp_mac = macoff;
h.h1->tp_net = netoff;
- h.h1->tp_sec = ts.tv_sec;
- h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
+ h.h1->tp_sec = hi;
+ h.h1->tp_usec = lo;
hdrlen = sizeof(*h.h1);
break;
case TPACKET_V2:
@@ -2338,8 +2373,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
h.h2->tp_snaplen = snaplen;
h.h2->tp_mac = macoff;
h.h2->tp_net = netoff;
- h.h2->tp_sec = ts.tv_sec;
- h.h2->tp_nsec = ts.tv_nsec;
+ h.h2->tp_sec = hi;
+ h.h2->tp_nsec = lo;
if (skb_vlan_tag_present(skb)) {
h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
@@ -2360,8 +2395,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
h.h3->tp_snaplen = snaplen;
h.h3->tp_mac = macoff;
h.h3->tp_net = netoff;
- h.h3->tp_sec = ts.tv_sec;
- h.h3->tp_nsec = ts.tv_nsec;
+ h.h3->tp_sec = hi;
+ h.h3->tp_nsec = lo;
memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
hdrlen = sizeof(*h.h3);
break;
--
2.9.0