#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/tcp.h>
#include <linux/hash.h>
#include <linux/tcp_metrics.h>
#include <linux/vmalloc.h>
#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/genetlink.h>
int sysctl_tcp_nometrics_save __read_mostly;
static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
						   const struct inetpeer_addr *daddr,
						   struct net *net, unsigned int hash);
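/* TCP Fast Open state cached per destination: the MSS advertised by the
 * peer, a counter of consecutive Fast Open SYN losses (with the time of
 * the most recent one), and the last cookie the peer handed out.
 */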
struct tcp_fastopen_metrics {
	u16	mss;
	u16	syn_loss:10;		/* Recurring Fast Open SYN losses */
	unsigned long	last_syn_loss;	/* Last Fast Open SYN loss */
	struct	tcp_fastopen_cookie	cookie;
};
/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
* Kernel only stores RTT and RTTVAR in usec resolution
*/
#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
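/* One cached-metrics entry, keyed by the (source, destination) address
 * pair.  Entries hang off a per-namespace hash table as RCU-protected
 * singly linked chains; tcpm_lock is a bitmask of TCP_METRIC_* values
 * that are locked on the route and must not be overwritten, and
 * tcpm_stamp records when the entry was last initialised from the route.
 */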
struct tcp_metrics_block {
	struct tcp_metrics_block __rcu	*tcpm_next;
	struct inetpeer_addr		tcpm_saddr;
	struct inetpeer_addr		tcpm_daddr;
	unsigned long			tcpm_stamp;
	u32				tcpm_ts;
	u32				tcpm_ts_stamp;
	u32				tcpm_lock;
	u32				tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
	struct tcp_fastopen_metrics	tcpm_fastopen;

	struct rcu_head			rcu_head;
};
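/* Accessors for the lock bitmask and the metric values; idx is one of
 * the TCP_METRIC_* indices.
 */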
static bool tcp_metric_locked(struct tcp_metrics_block *tm,
			      enum tcp_metric_index idx)
{
	return tm->tcpm_lock & (1 << idx);
}

static u32 tcp_metric_get(struct tcp_metrics_block *tm,
			  enum tcp_metric_index idx)
{
	return tm->tcpm_vals[idx];
}

static void tcp_metric_set(struct tcp_metrics_block *tm,
			   enum tcp_metric_index idx,
			   u32 val)
{
	tm->tcpm_vals[idx] = val;
}

static bool addr_same(const struct inetpeer_addr *a,
		      const struct inetpeer_addr *b)
{
	const struct in6_addr *a6, *b6;

	if (a->family != b->family)
		return false;
	if (a->family == AF_INET)
		return a->addr.a4 == b->addr.a4;

	a6 = (const struct in6_addr *) &a->addr.a6[0];
	b6 = (const struct in6_addr *) &b->addr.a6[0];

	return ipv6_addr_equal(a6, b6);
}
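/* Each hash bucket is just an RCU-protected chain head.  Readers are
 * expected to walk the chains under rcu_read_lock(); tcp_metrics_lock
 * serialises the writers that insert or recycle entries (see tcpm_new()
 * below).
 */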
struct tcpm_hash_bucket {
	struct tcp_metrics_block __rcu	*chain;
};

static DEFINE_SPINLOCK(tcp_metrics_lock);
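/* (Re)initialise an entry from the route it was learned on: remember
 * which metrics the route has locked, copy the raw route metrics across
 * (RTT and RTTVAR are kept here in usec, so the msec values from the
 * dst are scaled by USEC_PER_MSEC), reset the timestamp state and, when
 * fastopen_clear is set, forget any cached Fast Open cookie.
 */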
static void tcpm_suck_dst(struct tcp_metrics_block *tm,
			  const struct dst_entry *dst,
			  bool fastopen_clear)
{
	u32 msval;
	u32 val;

	tm->tcpm_stamp = jiffies;

	val = 0;
	if (dst_metric_locked(dst, RTAX_RTT))
		val |= 1 << TCP_METRIC_RTT;
	if (dst_metric_locked(dst, RTAX_RTTVAR))
		val |= 1 << TCP_METRIC_RTTVAR;
	if (dst_metric_locked(dst, RTAX_SSTHRESH))
		val |= 1 << TCP_METRIC_SSTHRESH;
	if (dst_metric_locked(dst, RTAX_CWND))
		val |= 1 << TCP_METRIC_CWND;
	if (dst_metric_locked(dst, RTAX_REORDERING))
		val |= 1 << TCP_METRIC_REORDERING;
	tm->tcpm_lock = val;

	msval = dst_metric_raw(dst, RTAX_RTT);
	tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;

	msval = dst_metric_raw(dst, RTAX_RTTVAR);
	tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;

	tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
	tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
	tm->tcpm_ts = 0;
	tm->tcpm_ts_stamp = 0;
	if (fastopen_clear) {
		tm->tcpm_fastopen.mss = 0;
		tm->tcpm_fastopen.syn_loss = 0;
		tm->tcpm_fastopen.cookie.len = 0;
	}
}
#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
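/* Entries are considered stale after TCP_METRICS_TIMEOUT (one hour);
 * a stale entry is refreshed from the current route before it is used,
 * without clearing its Fast Open state.
 */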
static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
{
	if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
		tcpm_suck_dst(tm, dst, false);
}
#define TCP_METRICS_RECLAIM_DEPTH 5
#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
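/* Create (or recycle) an entry for (saddr, daddr) in bucket 'hash'.
 * Called when a lookup did not find a usable entry: under
 * tcp_metrics_lock the chain is checked again, and either a fresh block
 * is allocated or, if the lookup returned TCP_METRICS_RECLAIM_PTR
 * because the chain already holds more than TCP_METRICS_RECLAIM_DEPTH
 * entries, the oldest entry in the bucket is reused in place.
 */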
static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
					  struct inetpeer_addr *saddr,
					  struct inetpeer_addr *daddr,
					  unsigned int hash)
{
	struct tcp_metrics_block *tm;
	struct net *net;
	bool reclaim = false;

	spin_lock_bh(&tcp_metrics_lock);
	net = dev_net(dst->dev);

	/* While waiting for the spin-lock the cache might have been populated
	 * with this entry and so we have to check again.
	 */
	tm = __tcp_get_metrics(saddr, daddr, net, hash);
	if (tm == TCP_METRICS_RECLAIM_PTR) {
		reclaim = true;
		tm = NULL;
	}
	if (tm) {
		tcpm_check_stamp(tm, dst);
		goto out_unlock;
	}

	if (unlikely(reclaim)) {
		struct tcp_metrics_block *oldest;

		oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
		for (tm = rcu_dereference(oldest->tcpm_next); tm;
		     tm = rcu_dereference(tm->tcpm_next)) {
			if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
				oldest = tm;
		}
		tm = oldest;
	} else {
		tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
		if (!tm)
			goto out_unlock;
	}
	tm->tcpm_saddr = *saddr;
	tm->tcpm_daddr = *daddr;

	tcpm_suck_dst(tm, dst, true);

	if (likely(!reclaim)) {
		tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
		rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
	}

out_unlock:
	spin_unlock_bh(&tcp_metrics_lock);
	return tm;
}
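/* Encode the outcome of a chain walk for the lookup helpers: an existing
 * entry is returned as-is, an over-long chain (depth greater than
 * TCP_METRICS_RECLAIM_DEPTH) is reported as TCP_METRICS_RECLAIM_PTR so
 * the caller recycles the oldest entry, and NULL means a new block
 * should be allocated.
 */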
static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
{
	if (tm)
		return tm;
	if (depth > TCP_METRICS_RECLAIM_DEPTH)
		return TCP_METRICS_RECLAIM_PTR;
	return NULL;
}
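/* RCU lookup of the (saddr, daddr) pair in bucket 'hash'.  Returns the
 * matching entry, NULL, or TCP_METRICS_RECLAIM_PTR (see above).
 */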
static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
						   const struct inetpeer_addr *daddr,
						   struct net *net, unsigned int hash)
{
	struct tcp_metrics_block *tm;
	int depth = 0;

	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_saddr, saddr) &&
		    addr_same(&tm->tcpm_daddr, daddr))
			break;
		depth++;
	}
	return tcp_get_encode(tm, depth);
}
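/* Look up the metrics entry for a request sock: build the address pair
 * from the request (IPv4 or IPv6), hash the peer address into a bucket
 * and walk that chain, refreshing a stale hit via tcpm_check_stamp().
 *
 * Illustrative sketch only (the caller shown here is hypothetical, not
 * part of this file): users of these helpers are expected to hold the
 * RCU read lock across the lookup and any tcp_metric_get() calls, e.g.
 *
 *	rcu_read_lock();
 *	tm = __tcp_get_metrics_req(req, dst);
 *	if (tm)
 *		rtt_us = tcp_metric_get(tm, TCP_METRIC_RTT);
 *	rcu_read_unlock();
 */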
static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
							struct dst_entry *dst)
{
	struct tcp_metrics_block *tm;
	struct inetpeer_addr saddr, daddr;
	unsigned int hash;
	struct net *net;

	saddr.family = req->rsk_ops->family;
	daddr.family = req->rsk_ops->family;
	switch (daddr.family) {
	case AF_INET:
		saddr.addr.a4 = inet_rsk(req)->ir_loc_addr;
		daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
		hash = (__force unsigned int) daddr.addr.a4;
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		*(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr;
		*(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr;
		hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
		break;
#endif
	default:
		return NULL;
	}

	net = dev_net(dst->dev);
	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_saddr, &saddr) &&
		    addr_same(&tm->tcpm_daddr, &daddr))
			break;
	}
	tcpm_check_stamp(tm, dst);
	return tm;
}
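/* Same lookup keyed by the addresses of a timewait socket; IPv6 sockets
 * connected to a v4-mapped peer are looked up under their IPv4 addresses.
 */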
static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
{
	struct tcp_metrics_block *tm;
	struct inetpeer_addr saddr, daddr;
	unsigned int hash;
	struct net *net;

	if (tw->tw_family == AF_INET) {
		saddr.family = AF_INET;
		saddr.addr.a4 = tw->tw_rcv_saddr;
		daddr.family = AF_INET;
		daddr.addr.a4 = tw->tw_daddr;
		hash = (__force unsigned int) daddr.addr.a4;
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (tw->tw_family == AF_INET6) {
		if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) {
			saddr.family = AF_INET;
			saddr.addr.a4 = tw->tw_rcv_saddr;
			daddr.family = AF_INET;
			daddr.addr.a4 = tw->tw_daddr;
			hash = (__force unsigned int) daddr.addr.a4;
		} else {
			saddr.family = AF_INET6;
			*(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr;
			daddr.family = AF_INET6;
			*(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr;
			hash = ipv6_addr_hash(&tw->tw_v6_daddr);
		}
	}
#endif
	else
		return NULL;

	net = twsk_net(tw);
	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_saddr, &saddr) &&
		    addr_same(&tm->tcpm_daddr, &daddr))
			break;
	}
	return tm;
}