blob: be35203f4846cc09bab97eecfc5aa850d9a515e1 [file] [log] [blame]
/*
* Fast path database hash implementation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Notes:
* Implementation according to Documentation/RCU/rcuref.txt
*/
#define pr_fmt(fmt) "mfp" " database:%s:%d: " fmt, __func__, __LINE__
#include "fp_common.h"
#include "fp_database.h"
#include "fp_device.h"
#include "fp_core.h"
#ifdef CONFIG_ASR_TOE
#include "../linux/drivers/marvell/toev2/toe.h"
#endif
#define FP_ZONE (NF_CT_DEFAULT_ZONE_ID)
#define GUARD_TIMEOUT_SEC (10)
static u32 hash_rnd __read_mostly;
/* Return the printable name for an entry_state value (names table defined
 * elsewhere; no bounds check on 'state'). */
static inline const char *state_to_string(enum entry_state state)
{
	return entry_state_names[state];
}
/*
 * Format a full human-readable dump of @el into @buf: both tuples, the
 * cached MAC header, interfaces, state and conntrack info.  Returns the
 * number of bytes written.  Caller must supply a large-enough buffer
 * (MAX_DEBUG_PRINT_SIZE) - nothing here bounds the writes.
 */
static inline int __fpdb_dump_entry(char *buf, struct fpdb_entry *el)
{
	int len = sprintf(buf, "fpdb dump entry (0x%p):\n", el);

	len += fp_dump_tuple(buf + len, &el->in_tuple);
	len += sprintf(buf + len, "\n");
	len += fp_dump_tuple(buf + len, &el->out_tuple);
	if (el->hh.hh_len) {
		/* Cached hardware header: the ethernet header sits at the
		 * aligned offset inside hh_data. */
		struct ethhdr *eth = (struct ethhdr *)(((u8 *) el->hh.hh_data) +
				(HH_DATA_OFF(sizeof(*eth))));
		/* NOTE(review): h_proto is big-endian but printed without
		 * ntohs() - value shows byte-swapped on LE; confirm intent. */
		len += sprintf(buf + len, "\nMAC header: src=%pM dst=%pM type=%04x\n",
				eth->h_source, eth->h_dest, eth->h_proto);
	} else {
		len += sprintf(buf + len, "\nMAC header was not set\n");
	}
	len += sprintf(buf + len, "Interfaces: in %p: %s, out %p: %s\n",
			el->in_dev,el->in_dev->dev->name, el->out_dev,el->out_dev->dev->name);
	len += sprintf(buf + len, "State: %s hits=%d pointer=%p\n",
			state_to_string(el->state), el->hit_counter, el);
	len += sprintf(buf + len, "ct info: ct=%p timeout: %x rc=%d\n",
			el->ct, el->ct->timeout, atomic_read(&el->rc));
	if (debug_level & DBG_INFO)
		len += sprintf(buf + len, "DEBUG: (NAT=%s) (route: in=%d out=%d)\n",
				NF_CT_NAT(el->ct) ? "YES" : "NO",
				el->debug.in_route_type, el->debug.out_route_type);
	return len;
}
/* Dump @el to the kernel log via pr_err, optionally prefixed by @msg. */
void fpdb_dump_entry(char *msg, struct fpdb_entry *el)
{
	char buf[MAX_DEBUG_PRINT_SIZE];
	int len = 0;

	BUG_ON(!el);
	if (msg)
		len = sprintf(buf, "%s", msg);
	len += __fpdb_dump_entry(buf + len, el);
	pr_err("%s", buf);
}
/* Dump a conntrack tuple to the kernel log, optionally prefixed by @msg. */
void fpdb_dump_tuple(char *msg, struct nf_conntrack_tuple *t)
{
	char out[MAX_DEBUG_PRINT_SIZE];
	int n = 0;

	BUG_ON(!t);
	if (msg)
		n = sprintf(out, "%s", msg);
	n += sprintf(out + n, "fpdb dump tuple:\n");
	n += fp_dump_tuple(out + n, t);
	pr_err("%s\n", out);
}
/*
 * fpdb_iterate() callback: print one entry as a table row.  *data is an
 * int flag that stays true until the header line has been printed once.
 * Takes a temporary reference (atomic_inc_not_zero) so the entry cannot
 * be freed mid-print; dying entries (rc == 0) are skipped silently.
 * Always returns 0 so the iteration continues.
 */
static int fpdb_print_entry(struct fpdb_entry *el, void *data)
{
	char in[256], out[256];
	unsigned int state, use;
	int *first_entry = data;

	if (atomic_inc_not_zero(&el->rc)) {
		if (*first_entry == true) {
			pr_err("l2 l3 l4 timeout\thash\thits\tstate in_dev out_dev tuple_in tuple_out ct block use refcnt\n");
			*first_entry = false;
		}
		__fp_dump_tuple(in, &el->in_tuple, 0);
		__fp_dump_tuple(out, &el->out_tuple, 0);
		/* tcp.state is only meaningful for TCP; the print below
		 * substitutes "N/A" for UDP. */
		state = el->ct->proto.tcp.state;
		use = atomic_read(&el->ct->ct_general.use);
		pr_err("%s %s %s %d\t%d\t%d\t%s %s %s %s %s %p %d %d %d\n",
				el->hh.hh_len ? "eth" : "NA",
				el->in_tuple.src.l3num == AF_INET6 ?
				"ipv6" : "ipv4",
				el->in_tuple.dst.protonum == IPPROTO_UDP ?
				"udp" : "tcp",
				jiffies_to_msecs(el->ct->timeout - jiffies) / 1000U,
				el->bucket, el->hit_counter,
				el->in_tuple.dst.protonum == IPPROTO_UDP ?
				"N/A" : tcp_conntrack_names[state],
				el->in_dev->dev->name,
				el->out_dev->dev->name,
				in, out, el->ct, el->block, use, atomic_read(&el->rc));
		fpdb_put(el);
	}
	return 0;
}
/* Print every database entry to the kernel log (header row printed once). */
void fpdb_dump_db(void)
{
	int header_pending = true;

	fpdb_iterate(fpdb_print_entry, &header_pending);
}
/****************************************************************************
* Fast Path Database prototypes
****************************************************************************/
/* Hash table backing the fastpath database. */
struct fpdb_htable {
	struct hlist_head *h;		/* bucket array */
	unsigned int size;		/* number of buckets */
	int vmalloced;			/* legacy (<3.9) allocation flag */
};

/* Global state of the fastpath database module. */
struct fp_database {
	struct fpdb_stats stats;	/* lookup/hit/histogram counters */
	volatile u32 num_entries;	/* current entry count (updated under lock) */
	struct fpdb_htable htable;
	spinlock_t lock;		/* protects hash table and num_entries */
	struct nf_ct_ext_type *nfct_ext;	/* conntrack extension registration */
	struct kmem_cache *db_cache;	/* slab cache for fpdb_entry objects */
};

/* NOTE(review): appears unused anywhere in this file - removal candidate. */
struct timeout_entry {
	struct list_head list;
	struct timer_list *timeout;
};
/****************************************************************************
 * Fast Path Database globals
 ****************************************************************************/
/* The single database instance, set up by fp_database_probe(). */
static struct fp_database *db;
/* TODO - do we need something else here??
   Or is there only one "net" in ESHEL? */
/* NOTE(review): non-static global named 'net' is shadow-prone (see device_cmp). */
struct net *net = &init_net;
#ifdef CONFIG_ASR_TOE
extern int fp_cm_genl_send_tuple(struct nf_conntrack_tuple *tuple, struct fpdb_entry *el,
int add, int len);
/*
 * Look up the neighbour (ARP for IPv4, NDP for IPv6) entry for the
 * flow's destination on the bridge shared by both fastpath devices and
 * copy its hardware address into @mac.
 *
 * Returns true and fills @mac on success; false when the two devices
 * are not on the same bridge or no neighbour entry exists.
 */
static inline bool get_remote_mac_addr(struct fpdb_entry *el, char *mac)
{
	struct neighbour *neigh;
	struct neigh_table *tbl;
	struct nf_conntrack_tuple *tuple;
	struct net_device *br;

	/* Both endpoints must hang off the same bridge device. */
	if (el->in_dev->br != el->out_dev->br)
		return false;
	tuple = &el->in_tuple;
	br = el->out_dev->br;
	/* Pick the neighbour table matching the L3 family. */
	tbl = (tuple->src.l3num == AF_INET6) ? &nd_tbl : &arp_tbl;
	neigh = neigh_lookup(tbl, tuple->dst.u3.all, br);
	if (neigh) {
		memcpy(mac, neigh->ha, ETH_ALEN);
		neigh_release(neigh);
		return true;
	}
	return false;
}
#endif
/*
 * Guard-timer callback: fires when a dying entry has not been freed
 * within GUARD_TIMEOUT_SEC, and re-arms itself while the refcount is
 * still held.
 *
 * NOTE(review): from_timer(el, &t, guard_timer) looks wrong - it runs
 * container_of() on the address of the local parameter 't', so 'el'
 * points into this function's stack frame, not at the entry.  Because
 * fpdb_entry.guard_timer is a *pointer* to a heap-allocated timer (see
 * __fpdb_del()), from_timer() cannot recover the entry at all; that
 * would need an embedded timer_list or an explicit back-pointer.
 * Confirm before trusting any el-> access below.
 */
static void guard_timer_timeout(struct timer_list *t)
{
	struct fpdb_entry *el = from_timer(el, &t, guard_timer);

	pr_err("Entry was hold and could not be removed for %d sec. [%px][rc=%d] state=%d\n",
			GUARD_TIMEOUT_SEC, el, atomic_read(&el->rc), el->state);
	/* BUG_ON(debug_level & DBG_WARN_AS_ERR);*/
	if (atomic_read(&el->rc) > 0) {
		FP_ERR_DUMP_ENTRY(NULL, el);
		pr_err("Extend the timer when rc is not 0!\n");
		mod_timer(el->guard_timer, jiffies + GUARD_TIMEOUT_SEC * HZ);
	}
}
#ifdef FP_USE_SRAM_POOL_OPT
/*
 * SRAM-pool variant of nf_ct_alloc_hashtable(): allocate the bucket
 * array from the SRAM pool, rounding the requested slot count up to
 * fill whole pages, and initialize the nulls heads when @nulls is set.
 * Returns NULL on allocation failure; *sizep is updated to the rounded
 * slot count.
 */
static void *local_nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;
	size_t sz;

	/* hlist_head and hlist_nulls_head must be interchangeable here. */
	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
	sz = nr_slots * sizeof(struct hlist_nulls_head);
	hash = (void *)sram_pool_alloc(sz);
	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);
	return hash;
}
/* Return a bucket array from local_nf_ct_alloc_hashtable() to the SRAM pool. */
static void local_nf_ct_free_hashtable(void *hash, unsigned int size)
{
	sram_pool_free((unsigned long)hash, size * sizeof(struct hlist_nulls_head));
}
#endif
/*
 * Allocate the fpdb bucket array, sized like the conntrack hash table,
 * from the SRAM pool or the regular allocator depending on build
 * config/kernel version.  Returns 0 on success, -ENOMEM on failure.
 */
static inline int fpdb_alloc_hashtable(struct fpdb_htable *htable)
{
	/* Currently use the same size used by others.. */
	htable->size = nf_conntrack_htable_size;
#ifdef FP_USE_SRAM_POOL_OPT
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
	htable->h = local_nf_ct_alloc_hashtable(&htable->size, &htable->vmalloced, 0);
#else
	htable->h = local_nf_ct_alloc_hashtable(&htable->size, 0);
#endif
#else
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
	htable->h = nf_ct_alloc_hashtable(&htable->size, &htable->vmalloced, 0);
#else
	htable->h = nf_ct_alloc_hashtable(&htable->size, 0);
#endif
#endif
	if (!htable->h)
		return -ENOMEM;
	pr_debug("allocated fpdb hashtable (size = %d)\n", htable->size);
	return 0;
}
/*
 * Free the bucket array allocated by fpdb_alloc_hashtable().
 *
 * NOTE(review): the pre-3.9 SRAM branch passes three arguments to
 * local_nf_ct_free_hashtable(), which takes two - that branch cannot
 * compile if ever enabled.  Confirm whether the legacy paths can be
 * dropped.
 */
static inline void fpdb_free_hashtable(struct fpdb_htable *htable)
{
#ifdef FP_USE_SRAM_POOL_OPT
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
	local_nf_ct_free_hashtable(htable->h, htable->vmalloced, htable->size);
#else
	local_nf_ct_free_hashtable(htable->h, htable->size);
#endif
#else
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
	nf_ct_free_hashtable(htable->h, htable->vmalloced, htable->size);
#else
	kvfree(htable->h);
	htable->h = NULL;
#endif
#endif
}
/****************************************************************************
* Fast Path Database API
****************************************************************************/
/**
 * Allocate and initialize a new database entry.
 *
 * @param flags allocation flags (used by the slab path; the SRAM pool
 *              path ignores them)
 *
 * @return new zeroed and initialized entry, or NULL on failure
 */
struct fpdb_entry *fpdb_alloc(gfp_t flags)
{
	struct fpdb_entry *el;

#ifdef FP_USE_SRAM_POOL_OPT
	el = (struct fpdb_entry *)sram_pool_alloc(sizeof(struct fpdb_entry));
	if (el)
		/*
		 * The slab path below uses kmem_cache_zalloc() and later
		 * code relies on zeroed fields (e.g. BUG_ON(entry->guard_timer)
		 * in __fpdb_del()); give the SRAM path the same guarantee.
		 */
		memset(el, 0, sizeof(*el));
#else
	el = kmem_cache_zalloc(db->db_cache, flags);
#endif
	if (!el) {
		pr_err("no memory\n");
		return NULL;
	}
	spin_lock_init(&el->lock);
	INIT_HLIST_NODE(&el->hlist);
	INIT_LIST_HEAD(&el->debug.trace.list);
	el->state = ENTRY_INITIALIZED;
#ifdef CONFIG_ASR_TOE
	el->nl_flag = 0;
#endif
	return el;
}
/**
 * Release a database entry and drop its device references.
 *
 * @param el entry previously allocated with fpdb_alloc()
 */
void fpdb_free(struct fpdb_entry *el)
{
	fpdev_put(el->out_dev);
	fpdev_put(el->in_dev);
#ifdef FP_USE_SRAM_POOL_OPT
	sram_pool_free((unsigned long)el, sizeof(struct fpdb_entry));
#else
	kmem_cache_free(db->db_cache, el);
#endif
}
/**
 * Jenkins-hash a conntrack tuple into a bucket index.
 *
 * NOTE(review): only u3.ip (the first 32 bits of the address union) is
 * hashed, so IPv6 flows sharing the leading /32 of src and dst collide
 * into the same bucket - confirm the spread is acceptable for IPv6.
 *
 * @return bucket index in [0, htable.size)
 */
static inline unsigned int
fpdb_hash_by_src(const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash_src, hash_dst, hash;

	BUG_ON(!tuple);
	hash_src = jhash_3words((__force u32) tuple->src.u3.ip,
			(__force u32) tuple->src.u.all ^ FP_ZONE,
			tuple->src.l3num, hash_rnd);
	hash_dst = jhash_3words((__force u32) tuple->dst.u3.ip,
			(__force u32) tuple->dst.u.all ^ FP_ZONE,
			tuple->dst.protonum, hash_rnd);
	hash = jhash_2words(hash_src, hash_dst, hash_rnd);
	/* Scale the 32-bit hash onto the table size without a modulo. */
	return ((u64)hash * db->htable.size) >> 32;
}
/**
 * RCU callback that actually frees a dying entry.
 *
 * Runs after a grace period once the last reference was dropped.
 * Stops and frees the guard timer, decrements the global entry count,
 * drops the device references and returns the memory to its pool.
 *
 * @param head rcu_head embedded in the fpdb_entry
 */
static void fpdb_rcu_free(struct rcu_head *head)
{
	/*
	 * 'head' is the plain callback argument, not an RCU-protected
	 * pointer, so rcu_dereference() is not appropriate here (and the
	 * old NULL check after container_of() could never trigger for a
	 * non-NULL head).
	 */
	struct fpdb_entry *el = container_of(head, struct fpdb_entry, rcu);

	BUG_ON(atomic_read(&el->rc) || el->state != ENTRY_DYING);
	FP_DEBUG_DUMP_ENTRY("fpdb_rcu_free: entry was deleted\n", el);
	if (el->guard_timer) {
		del_timer_sync(el->guard_timer);
		kfree(el->guard_timer);
		el->guard_timer = NULL;
	}
	spin_lock_bh(&db->lock);
	db->num_entries--;
	spin_unlock_bh(&db->lock);
	fpdev_put(el->out_dev);
	fpdev_put(el->in_dev);
#ifdef FP_USE_SRAM_POOL_OPT
	sram_pool_free((unsigned long)el, sizeof(struct fpdb_entry));
#else
	kmem_cache_free(db->db_cache, el);
#endif
}
/**
 * Drop one reference to @el; when the count reaches zero the entry is
 * freed after an RCU grace period via fpdb_rcu_free().
 *
 * @param el pointer to a previously allocated fpdb_entry
 */
void fpdb_put(struct fpdb_entry *el)
{
	if (atomic_dec_and_test(&el->rc))
		call_rcu(&el->rcu, fpdb_rcu_free);
}
/* Systems with at most 64 MB of RAM are considered memory-constrained
 * and get a lower entry cap.  (Fixes typo: was 64 * 1024 * 1204.) */
#define FP_SMALL_MEM_LIMIT (64 * 1024 * 1024)
/**
 * Add a previously allocated entry to the database and set its
 * reference count to 1.  Hashes by the input tuple only.
 *
 * When the database grows past its cap the whole database is flushed;
 * past 3/4 of the cap the oldest entry is evicted instead.
 *
 * @attention el must be allocated first with fpdb_alloc()
 * @param el pointer to a previously allocated fpdb_entry
 */
void fpdb_add(struct fpdb_entry *el)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)
	int pages = totalram_pages();
#else
	int pages = totalram_pages;
#endif
	unsigned int hash;
	u32 max_num;
#ifdef CONFIG_ASR_TOE
	char mac[ETH_ALEN];
#endif

	/* Validate before locking: spin_lock_bh(&el->lock) already
	 * dereferences el, so the old after-lock check could never fire. */
	BUG_ON(!el || !el->out_dev);

	spin_lock_bh(&el->lock);
	spin_lock_bh(&db->lock);
	BUG_ON(el->state != ENTRY_INITIALIZED);
	hash = fpdb_hash_by_src(&el->in_tuple);
	atomic_set(&el->rc, 1);
	el->state = ENTRY_ALIVE;
	el->bucket = hash;
	el->tstamp = jiffies;
	if (!el->tstamp)	/* 0 is reserved as "marked for eviction" */
		el->tstamp = 1;
	BUG_ON(in_irq());
	WARN_ON_ONCE(irqs_disabled());
	hlist_add_head_rcu(&el->hlist, &db->htable.h[hash]);
	db->num_entries++;
#ifdef CONFIG_ASR_TOE
	if (get_remote_mac_addr(el, mac))
		mfp_toe_add_dmac(el->out_dev->dev, mac);
#endif
	spin_unlock_bh(&db->lock);
	spin_unlock_bh(&el->lock);
	/* Normally Conntrack MAX is HashSize*8. So here is not suit to only check double*/
	/*we will modify the code to check 6 times of hash size --Yhuang 20160617*/
	if (pages <= (FP_SMALL_MEM_LIMIT >> PAGE_SHIFT))
		max_num = 2 * db->htable.size;
	else
		max_num = 6 * db->htable.size;
	if (unlikely(db->num_entries > max_num)) {
		pr_err_ratelimited("%s: database overloaded (%d entries, max=%d)\n",
				__func__, db->num_entries, max_num);
		fpdb_flush();
	} else if (unlikely(db->num_entries > ((max_num * 3) / 4))) {
		fpdb_del_least_used_entry(max_num);
	}
	if (db->stats.max_entries < db->num_entries)
		db->stats.max_entries = db->num_entries;
	FP_DEBUG_DUMP_ENTRY("fpdb_add: entry was added\n", el);
}
/**
 * Query the database for an entry matching the input tuple and take a
 * reference on it if found.
 *
 * @attention The user MUST call fpdb_put() as soon as the entry
 * is not used!
 *
 * @param tuple pointer to a nf_conntrack_tuple
 *
 * @return pointer to the matching entry, NULL if not found
 */
struct fpdb_entry *fpdb_get(struct nf_conntrack_tuple *tuple)
{
	unsigned int hash, iterations = 0;
	struct fpdb_entry *el;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
	struct hlist_node *h;
#endif

	BUG_ON(!tuple);
	db->stats.lookups++;
	hash = fpdb_hash_by_src(tuple);
	rcu_read_lock_bh();
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
	hlist_for_each_entry_rcu(el, h, &db->htable.h[hash], hlist) {
#else
	hlist_for_each_entry_rcu(el, &db->htable.h[hash], hlist) {
#endif
		if (el && nf_ct_tuple_equal(&el->in_tuple, tuple)) {
			/* A matching entry whose refcount already hit zero is
			 * dying; the search is abandoned rather than continued.
			 * NOTE(review): assumes a tuple appears at most once
			 * per bucket - confirm. */
			if (!atomic_inc_not_zero(&el->rc))
				goto not_found;
			rcu_read_unlock_bh();
			if (!iterations)	/* "hit" = found on first probe */
				db->stats.hits++;
			el->hit_counter++;
			FP_DEBUG_DUMP_ENTRY("fpdb_get: entry was found:\n", el);
			return el;
		}
		iterations++;
		db->stats.iterations++; /* Total Iterations*/
	}
not_found:
	rcu_read_unlock_bh();
	FP_DEBUG_DUMP_TUPLE("fpdb_get: entry was not found:\n", tuple);
	return NULL;
}
#ifdef CONFIG_ASR_TOE
/*
 * Tell the TOE engine to tear down the offloaded connection for @el.
 * Classifies the flow's endpoints by interface-name prefix, derives the
 * forward/NAT/direction attributes the engine needs, and calls
 * toe_del_connection().  Returns 0 when there is nothing to do,
 * otherwise the TOE result.
 */
static int fpdb_del_toe_tuple(struct fpdb_entry *el)
{
	struct toe_tuple_buff toe_tuple;
	struct fp_net_device *dst, *src;
	u32 nat_ip = 0;
	u16 nat_port = 0;
	u8 proto = 0, in_pkt = 0, out_pkt = 0, fwd = 0, nat = 0;
	u8 rx_tx;

	BUG_ON(!el);
	if (!el->nl_flag)
		return 0;

	/*
	 * Zero everything up front: the IPv6 branch below does not set the
	 * nat/nat_port/nat_ip fields, which the old code handed to
	 * toe_del_connection() uninitialized.
	 */
	memset(&toe_tuple, 0, sizeof(toe_tuple));

	src = rcu_dereference_bh(el->in_dev);
	dst = rcu_dereference_bh(el->out_dev);

	/* Classify ingress by interface-name prefix. */
	if (!strncasecmp(src->dev->name, "ccinet", 6))
		in_pkt = PDU_PKT;
	else if (!strncasecmp(src->dev->name, "usbnet", 6))
		in_pkt = USB_PKT;
	else if (!strncasecmp(src->dev->name, "wlan", 4))
		in_pkt = WIFI_PKT;
	else if (!strncasecmp(src->dev->name, "eth", 3))
		in_pkt = ETH_PKT;
	else
		in_pkt = AP_PKT;

	/* Classify egress the same way. */
	if (!strncasecmp(dst->dev->name, "ccinet", 6))
		out_pkt = PDU_PKT;
	else if (!strncasecmp(dst->dev->name, "usbnet", 6))
		out_pkt = USB_PKT;
	else if (!strncasecmp(dst->dev->name, "wlan", 4))
		out_pkt = WIFI_PKT;
	else if (!strncasecmp(dst->dev->name, "eth", 3))
		out_pkt = ETH_PKT;
	else
		out_pkt = AP_PKT;

	/* Forwarded = neither side terminates in the AP. */
	fwd = (in_pkt != AP_PKT) && (out_pkt != AP_PKT);
	if (fwd && (el->out_tuple.src.l3num == AF_INET)) {
		if (in_pkt == PDU_PKT && (out_pkt == USB_PKT || out_pkt == WIFI_PKT || out_pkt == ETH_PKT)) {
			/* Downlink NAT: rewrite to the LAN-side source. */
			nat = 1;
			nat_ip = ntohl(el->out_tuple.src.u3.ip);
			nat_port = ntohs(el->out_tuple.src.u.all);
		} else if ((in_pkt == USB_PKT || in_pkt == WIFI_PKT || in_pkt == ETH_PKT) && out_pkt == PDU_PKT) {
			/* Uplink NAT: rewrite to the WAN-side destination. */
			nat = 1;
			nat_ip = ntohl(el->out_tuple.dst.u3.ip);
			nat_port = ntohs(el->out_tuple.dst.u.all);
		} else
			/* CP TOE WIFI/WIFI TOE CP no need nat */
			nat = 0;
	}

	/* rx: cp -> ap, usb, wifi */
	if (in_pkt == PDU_PKT)
		rx_tx = 1;
	/* rx: ap -> usb, ap -> wifi */
	else if ((in_pkt == AP_PKT) && (out_pkt != PDU_PKT))
		rx_tx = 1;
	/*
	 * tx:
	 * ap -> cp
	 * usb/wifi -> ap/cp */
	else
		rx_tx = 0;

	if (el->in_tuple.src.l3num == AF_INET6) {
		memcpy(toe_tuple.src_ip6, el->in_tuple.src.u3.all, sizeof(toe_tuple.src_ip6));
		/* Fixed: the old code sized this copy with sizeof(src_ip6). */
		memcpy(toe_tuple.dst_ip6, el->in_tuple.dst.u3.all, sizeof(toe_tuple.dst_ip6));
		toe_tuple.ip6 = 1;
	} else {
		toe_tuple.src_ip = ntohl(el->in_tuple.src.u3.ip);
		toe_tuple.dst_ip = ntohl(el->in_tuple.dst.u3.ip);
		toe_tuple.ip6 = 0;
		toe_tuple.nat = nat;
		toe_tuple.nat_port = nat_port;
		toe_tuple.nat_ip = nat_ip;
	}
	if (el->in_tuple.dst.protonum == IPPROTO_UDP)
		proto = TOE_UDP;
	else if (el->in_tuple.dst.protonum == IPPROTO_TCP)
		proto = TOE_TCP;
	else
		proto = TOE_MAX;
	toe_tuple.src_port = ntohs(el->in_tuple.src.u.all);
	toe_tuple.dst_port = ntohs(el->in_tuple.dst.u.all);
	toe_tuple.prot = proto;
	toe_tuple.fwd = fwd;
	toe_tuple.rxtx = rx_tx;
	toe_tuple.out_pkt = out_pkt;
	return toe_del_connection(&toe_tuple);
}
#endif
/*
 * Tear down a live entry: mark it dying, withdraw any TOE offload,
 * optionally unlink it from the hash, and drop the database's own
 * reference.  When that was the last reference, a guard timer is armed
 * to flag entries whose RCU free does not complete in time.
 *
 * Caller must hold db->lock.
 *
 * @param entry     entry to delete; no-op unless it is ENTRY_ALIVE
 * @param hlist_del true to also unlink from the hash table (false when
 *                  the node was already swapped out, see fpdb_replace())
 */
void __fpdb_del(struct fpdb_entry *entry, bool hlist_del)
{
	BUG_ON(!entry);
	if(entry->state != ENTRY_ALIVE)
		return;
	entry->state = ENTRY_DYING;
#ifdef CONFIG_ASR_TOE
	if (entry->nl_flag) {
		/* Notify user space and the TOE engine that the flow is gone. */
		fp_cm_genl_send_tuple(&entry->in_tuple, entry, 0, 0);
		if (fpdb_del_toe_tuple(entry))
			pr_debug("fpdb_del_toe_tuple failed!!!\r\n");
		entry->nl_flag = 0;
	}
#endif
	BUG_ON(entry->guard_timer);
	if (hlist_del)
		hlist_del_rcu(&entry->hlist);
	if (atomic_dec_and_test(&entry->rc)) {
		/* move start timer here to avoid rc is not zero yhuang 20160624*/
		entry->guard_timer = kmalloc(sizeof(*entry->guard_timer), GFP_ATOMIC);
		if (entry->guard_timer) {
			timer_setup(entry->guard_timer, guard_timer_timeout, 0);
			mod_timer(entry->guard_timer, jiffies + GUARD_TIMEOUT_SEC * HZ);
		} else {
			pr_err("Guard timer allocation failed!");
		}
		/* prevent out of order so that guard timer can be stopped */
		mb();
		call_rcu(&entry->rcu, fpdb_rcu_free);
	} else {
		/* Someone still holds a reference; fpdb_put() will free it. */
		pr_err("__fpdb_del fail. entry:%p, rc=%d, state=%d\n", entry,
				atomic_read(&entry->rc), entry->state);
	}
}
/* Acquire the database lock (BH-safe).  Note: 'return <void expr>;' as in
 * the old code is a constraint violation in strict ISO C. */
void fpdb_lock_bh(void)
{
	spin_lock_bh(&db->lock);
}
/* Release the database lock taken by fpdb_lock_bh(). */
void fpdb_unlock_bh(void)
{
	spin_unlock_bh(&db->lock);
}
/* Delete @entry under the database lock (unlink + drop the DB reference). */
void fpdb_del(struct fpdb_entry *entry)
{
	spin_lock_bh(&db->lock);
	__fpdb_del(entry, true);
	spin_unlock_bh(&db->lock);
}
/**
 * Replace an entry already in the database with a newly allocated one
 * occupying the same bucket.
 *
 * @attention nel must be allocated first with fpdb_alloc();
 *            el must already be in the database.
 * @param el  pointer to a previously added fpdb_entry
 * @param nel pointer to a newly allocated fpdb_entry
 * NOTE: must be called from softirq/lock_bh context
 * NOTE(review): num_entries is incremented here without taking db->lock -
 * presumably the caller holds it; confirm.
 */
void fpdb_replace(struct fpdb_entry *el, struct fpdb_entry *nel)
{
	unsigned int hash;

	BUG_ON(!el || !el->out_dev);
	BUG_ON(!nel || !nel->out_dev);
	BUG_ON(nel->state != ENTRY_INITIALIZED);
	hash = fpdb_hash_by_src(&nel->in_tuple);
	atomic_set(&nel->rc, 1);
	nel->state = ENTRY_ALIVE;
	nel->bucket = hash;
	/* Both tuples must hash to the same bucket for an in-place swap. */
	BUG_ON(el->bucket != nel->bucket);
	db->num_entries++;
	hlist_replace_rcu(&el->hlist, &nel->hlist);
	/* Old node already unlinked by the replace above. */
	__fpdb_del(el, false);
}
/*
 * Conntrack iterator predicate: returns 1 when either direction of the
 * conntrack's fastpath state references @dev as ingress or egress,
 * causing the conntrack to be cleaned up.
 */
static int device_cmp(struct nf_conn *ct, void *dev)
{
	struct net_device *ndev = dev;
	struct nf_conn_fastpath *fp = nfct_fastpath(ct);
	struct fpdb_entry *e;

	if (!fp)
		return 0;

	e = fp->fpd_el[IP_CT_DIR_ORIGINAL];
	if (e && (fpdev_cmp_if(e->in_dev, ndev) || fpdev_cmp_if(e->out_dev, ndev)))
		return 1;

	e = fp->fpd_el[IP_CT_DIR_REPLY];
	if (e && (fpdev_cmp_if(e->in_dev, ndev) || fpdev_cmp_if(e->out_dev, ndev)))
		return 1;

	return 0;
}
static inline bool
tuple_cmp_port(const struct nf_conntrack_tuple *t, unsigned int port)
{
return (ntohs(t->dst.u.all) == port || ntohs(t->src.u.all) == port);
}
/*
 * Conntrack iterator predicate: returns 1 when either direction of the
 * conntrack's fastpath state uses @ptr (a port number smuggled through
 * the void pointer) in any of its tuples.
 */
static int port_cmp(struct nf_conn *ct, void *ptr)
{
	unsigned int port = (unsigned int)(unsigned long)ptr;
	struct nf_conn_fastpath *fp = nfct_fastpath(ct);
	struct fpdb_entry *e;

	if (!fp)
		return 0;

	e = fp->fpd_el[IP_CT_DIR_ORIGINAL];
	if (e && (tuple_cmp_port(&e->in_tuple, port) ||
			tuple_cmp_port(&e->out_tuple, port)))
		return 1;

	e = fp->fpd_el[IP_CT_DIR_REPLY];
	if (e && (tuple_cmp_port(&e->in_tuple, port) ||
			tuple_cmp_port(&e->out_tuple, port)))
		return 1;

	return 0;
}
/* kill all fastpath related conntracks */
static int nf_fp_remove(struct nf_conn *ct, void *data)
{
	/* Match any conntrack that ever had a fastpath entry attached. */
	return test_bit(IPS_FASTPATH_BIT, &ct->status);
}
/*
 * fpdb_iterate() callback: delete a "blocked" entry whose ingress or
 * egress matches @data (a net_device), first detaching it from its
 * conntrack's fastpath extension so the conntrack cannot reach it again.
 * Always returns 0 so the iteration visits every entry.
 */
int fpdb_del_block_entry_by_dev(struct fpdb_entry *el, void *data)
{
	struct net_device *dev = (struct net_device *)data;
	struct nf_conn_fastpath *ct_fp;

	if (fpdev_cmp_if(el->in_dev, dev) ||
			fpdev_cmp_if(el->out_dev, dev)) {
		spin_lock_bh(&db->lock);
		ct_fp = nfct_fastpath(el->ct);
		if (ct_fp) {
			/* Already detached => someone else is deleting it. */
			if (ct_fp->fpd_el[el->dir] == NULL) {
				spin_unlock_bh(&db->lock);
				return 0;
			}
			ct_fp->fpd_el[el->dir] = NULL;
		}
		spin_unlock_bh(&db->lock);
		fpdb_del(el);
		printk(KERN_DEBUG "delete a block entry related to %s\n", dev->name);
	}
	return 0;
}
/*
 * Conntrack iterator predicate: returns 1 when this conntrack owns the
 * fpdb entry passed in @del (in either direction), marking that entry's
 * tstamp as 0 ("being reclaimed") on the way out.
 */
static int nf_fpdb_del(struct nf_conn *ct, void *del)
{
	struct fpdb_entry *target = del;
	struct nf_conn_fastpath *fp = nfct_fastpath(ct);

	if (!fp)
		return 0;

	if (target && fp->fpd_el[IP_CT_DIR_ORIGINAL] == target) {
		target->tstamp = 0;
		return 1;
	}
	if (target && fp->fpd_el[IP_CT_DIR_REPLY] == target) {
		target->tstamp = 0;
		return 1;
	}
	return 0;
}
/*
 * fpdb_iterate() callback: track the entry with the oldest creation
 * tstamp in *data.  Entries with tstamp == 0 (already marked for
 * eviction by nf_fpdb_del()) never win the comparison, though one can
 * still be the initial candidate if seen first.  Always returns 0 so
 * the whole table is scanned.
 */
static int fpdb_find_lest_used_entry(struct fpdb_entry *el, void *data)
{
	struct fpdb_entry **p_el = (struct fpdb_entry **)data;

	if (!*p_el)
		*p_el = el;
	else if (el->tstamp && time_before(el->tstamp, (*p_el)->tstamp))
		*p_el = el;
	return 0;
}
/**
 * Evict the oldest entry when the database nears its capacity: find it
 * and kill its owning conntrack, which tears the entry down via the
 * conntrack destroy path.
 *
 * @param max_num current database capacity (for logging only)
 */
void fpdb_del_least_used_entry(int max_num)
{
	struct fpdb_entry *el = NULL;

	fpdb_iterate(fpdb_find_lest_used_entry, &el);
	if (!el)
		return;
	/*
	 * Use %px for the pointer: the old (unsigned)el cast truncated it
	 * on 64-bit and mismatched the %x format specifier.
	 */
	pr_info_ratelimited("%s: el=%px (%d entries, max=%d)\n",
			__func__, el, db->num_entries, max_num);
	nf_ct_iterate_cleanup(&nf_fpdb_del, (void *)el, 0, 0);
}
/**
 * Remove all fastpath related connections with the specified network device
 *
 * caller should have rtnl locked
 *
 * @param dev device whose flows should be torn down
 */
void fpdb_del_by_dev(struct net_device *dev)
{
	/* Kill the owning conntracks; entries are freed via the conntrack
	 * destroy extension (fpdb_destroy_ext). */
	nf_ct_iterate_cleanup(&device_cmp, (void *)dev, 0, 0);
	printk(KERN_DEBUG "All entries related to %s deleted\n", dev->name);
}
/**
 * Remove all fastpath related connections with the specified port
 *
 * caller should have rtnl locked
 *
 * @param port port in host byte order, matched against either endpoint
 *             of either direction's tuples
 */
void fpdb_del_by_port(unsigned int port)
{
	nf_ct_iterate_cleanup(&port_cmp, (void *)(unsigned long)port, 0, 0);
	pr_debug("All entries with port=%d deleted\n", port);
}
/**
 * flush the entire database by cleaning all fastpath related
 * conntracks
 *
 * MUST BE CALLED IN PROCESS CONTEXT
 */
void fpdb_flush(void)
{
	nf_ct_iterate_cleanup(&nf_fp_remove, 0, 0, 0);
	pr_debug("All entries flushed\n");
}
/**
 * Iterate over all fpdb entries.
 * MUST BE CALLED IN PROCESS CONTEXT
 *
 * @param iter callback invoked per entry; a non-zero return stops the
 *             whole iteration.  (The old code's 'break' only left the
 *             current bucket and kept scanning the rest of the table;
 *             all in-tree callbacks return 0, so the fix is inert for
 *             them.  The old comment also had the polarity backwards.)
 * @param data opaque cookie passed to @iter
 */
void fpdb_iterate(int (*iter)(struct fpdb_entry *e, void *data), void *data)
{
	int i;
	struct fpdb_entry *e;

	for (i = 0; i < db->htable.size; i++) {
		rcu_read_lock_bh();
		hlist_for_each_entry_rcu(e, &db->htable.h[i], hlist) {
			if (iter(e, data)) {
				rcu_read_unlock_bh();
				return;
			}
		}
		rcu_read_unlock_bh();
	}
}
/**
 * Append the entry's current state to its debug trace list when the
 * DBG_TRACE_LOG debug level is enabled.  Keeps at most 5 records,
 * dropping the oldest (list tail).
 *
 * @param entry entry to log (must be non-NULL)
 * @param tcph  TCP header of the triggering packet, or NULL for UDP
 */
void fpdb_trace(struct fpdb_entry *entry, struct tcphdr *tcph)
{
	if (debug_level & DBG_TRACE_LOG) {
		struct fpdb_trace *trace = kzalloc(sizeof(struct fpdb_trace), GFP_ATOMIC);

		BUG_ON(!entry);
		/* GFP_ATOMIC can fail; the old code dereferenced NULL here. */
		if (!trace)
			return;
		trace->timeout = jiffies_to_msecs(entry->ct->timeout - jiffies) / 1000U;
		trace->ct_status = entry->ct->status;
		trace->hit_counter = entry->hit_counter;
		if (tcph) {
			trace->tcp_state = entry->ct->proto.tcp.state;
			trace->tcph = *tcph;
		}
		list_add(&trace->list, &entry->debug.trace.list);
		if (++entry->debug.trace.sz > 5) {
			/* TODO - change to configurable param */
			trace = list_entry(entry->debug.trace.list.prev, struct fpdb_trace, list);
			list_del(entry->debug.trace.list.prev);
			kfree(trace);
			entry->debug.trace.sz--;
		}
	}
}
/****************************************************************************
* Fast Path Database private
****************************************************************************/
/* SYS FS and PROC FS */
/*
 * Recompute bucket-occupancy statistics (occupied buckets, depth
 * histogram, largest bucket) by walking every bucket under RCU.
 * NOTE(review): counts >= HISTOGRAM_SIZE accumulate in
 * hist[HISTOGRAM_SIZE], which assumes the array has HISTOGRAM_SIZE + 1
 * slots - confirm against the declaration of fpdb_stats.hist.
 */
static void fpdb_get_stats(void)
{
	int i, count, max = 0;
	struct fpdb_entry *el;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
	struct hlist_node *h;
#endif

	memset(db->stats.hist, 0, sizeof(db->stats.hist));
	db->stats.num_occupied = 0;
	for (i = 0; i < db->htable.size; i++) {
		count = 0;
		rcu_read_lock_bh();
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
		hlist_for_each_entry_rcu(el, h, &db->htable.h[i], hlist)
#else
		hlist_for_each_entry_rcu(el, &db->htable.h[i], hlist)
#endif
			count++;
		rcu_read_unlock_bh();
		if (count)
			db->stats.num_occupied++;
		if (count < HISTOGRAM_SIZE) {
			db->stats.hist[count].buckets++;
			db->stats.hist[count].entries += count;
		} else {
			db->stats.hist[HISTOGRAM_SIZE].buckets++;
			db->stats.hist[HISTOGRAM_SIZE].entries += count;
		}
		max = (count > max) ? count : max;
	}
	db->stats.largest_bucket = max;
}
/*
 * sysfs "flush" store: with a device name argument, delete only the
 * entries referencing that device; otherwise flush the whole database.
 */
static ssize_t fpdb_sysfs_flush(struct fastpath_module *m, const char *buf, size_t count)
{
	struct net_device *dev;

	if (count > 2) {
		char *str = kmalloc(count, GFP_KERNEL);

		/* The old code passed an unchecked NULL to sprintf(). */
		if (!str)
			return -ENOMEM;
		/* Bounded copy (sprintf could overflow the count-byte
		 * buffer), then strip the trailing newline. */
		snprintf(str, count, "%s", buf);
		str[count - 1] = '\0';
		dev = dev_get_by_name(&init_net, str);
		kfree(str);
		if (dev) {
			fpdb_del_by_dev(dev);
			dev_put(dev);
			return count;
		}
	}
	fpdb_flush();
	return count;
}
/*
 * sysfs "stats" show: render hit rate, load factor and the bucket-depth
 * histogram computed by fpdb_get_stats() into @buf.
 */
static ssize_t fpdb_sysfs_stats_show(struct fastpath_module *m, char *buf)
{
	int len, i;
	u32 sum_pct = 0;

	fpdb_get_stats();
	len = sprintf(buf, "Fast Path Database (HASH) statistics:\n");
	len += sprintf(buf + len, "Max number of entries: %d ",
			db->stats.max_entries);
	/* The "? : 1" guards divide-by-zero before any lookups happened. */
	len += sprintf(buf + len, "Total lookups: %d, Total hits: %d, "
			"hit rate %d%%\n", db->stats.lookups, db->stats.hits,
			(100 * db->stats.hits) / (db->stats.lookups ?
			db->stats.lookups : 1));
	len += sprintf(buf + len, "Database Size is %d Buckets\n",
			db->htable.size);
	len += sprintf(buf + len, "Number of occupied buckets: %d\n",
			db->stats.num_occupied);
	len += sprintf(buf + len, "Database contains %d entries\n",
			db->num_entries);
	len += sprintf(buf + len, "Largest bucket contains %d entries\n",
			db->stats.largest_bucket);
	len += sprintf(buf + len, "Load Factor is %d (%d/%d)\n",
			db->num_entries /
			(db->htable.size ? db->htable.size : 1),
			db->num_entries, db->htable.size);
	len += sprintf(buf + len, "find_entry() iterations/lookups: %d/%d\n",
			db->stats.iterations, db->stats.lookups);
	len += sprintf(buf + len, "Histogram:\n");
	len += sprintf(buf + len, "Size buckets entries sum-pct\n");
	for (i = 0; i < HISTOGRAM_SIZE; i++) {
		/* Cumulative percentage, clamped at 100. */
		if (sum_pct < 100)
			sum_pct += (100 * db->stats.hist[i].entries) /
					(db->num_entries ?
					db->num_entries : 1);
		else
			sum_pct = 100;
		len += sprintf(buf + len, "%4d%10d%10d%10d\n", i,
				db->stats.hist[i].buckets,
				db->stats.hist[i].entries, sum_pct);
	}
	/* Overflow row: i == HISTOGRAM_SIZE here (buckets deeper than the
	 * histogram range). */
	len += sprintf(buf + len, ">%3d%10d%10d%10d\n", i - 1,
			db->stats.hist[i].buckets,
			db->stats.hist[i].entries, 100);
	return len;
}
/* sysfs "stats" store: any write resets all statistics counters. */
static ssize_t fpdb_sysfs_stats_clear(struct fastpath_module *m, const char *buf,
		size_t count)
{
	pr_debug("reset stats...\n");
	memset(&db->stats, 0, sizeof(db->stats));
	return count;
}
/* Bucket index selected through the "bucket" sysfs attribute. */
static unsigned int dbg_hash;

/* sysfs "bucket" store: select which bucket fpdb_sysfs_entry_debug_show()
 * dumps.  NOTE(review): sscanf result is unchecked - non-numeric input
 * silently leaves dbg_hash unchanged. */
static ssize_t fpdb_sysfs_entry_debug_select(struct fastpath_module *m, const char *buf,
		size_t count)
{
	sscanf(buf, "%u", &dbg_hash);
	return count;
}
/*
 * sysfs "bucket" show: dump every entry in bucket dbg_hash, including
 * the conntrack fastpath extension pointers and (TCP only) the recent
 * trace records.
 */
static ssize_t fpdb_sysfs_entry_debug_show(struct fastpath_module *m, char *buf)
{
	struct fpdb_entry *el;
	int i = 0, len;
	struct fpdb_trace *itr;
	struct nf_conn_fastpath *fp_ext;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
	struct hlist_node *h;
#endif

	/* Valid bucket indices are 0 .. size-1; the old '>' check allowed
	 * dbg_hash == size and an out-of-bounds read at htable.h[size]. */
	if (dbg_hash >= db->htable.size)
		return sprintf(buf, "invalid hash (%d)\n", dbg_hash);

	len = sprintf(buf, "debug info for bucket%u:\n", dbg_hash);
	rcu_read_lock_bh();
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
	hlist_for_each_entry_rcu(el, h, &db->htable.h[dbg_hash], hlist) {
#else
	hlist_for_each_entry_rcu(el, &db->htable.h[dbg_hash], hlist) {
#endif
		len += __fpdb_dump_entry(buf+len, el);
		fp_ext = nf_ct_ext_find(el->ct, NF_CT_EXT_FASTPATH);
		BUG_ON(!fp_ext);
		len += sprintf(buf+len, "fastpath_ext orig:=%p reply=%p\n",
				fp_ext->fpd_el[IP_CT_DIR_ORIGINAL],
				fp_ext->fpd_el[IP_CT_DIR_REPLY]);
		if (el->in_tuple.dst.protonum == IPPROTO_UDP)
			continue;	/* trace records are TCP-only */
		len += sprintf(buf+len, "%d: trace:\n", i++);
		len += sprintf(buf+len, "hits timeout tcp_state tcp_flags ct_status\n");
		list_for_each_entry(itr, &el->debug.trace.list, list)
			len += sprintf(buf+len, "%d %d %s %c%c%c%c%c%c %lu\n",
					itr->hit_counter, itr->timeout,
					tcp_conntrack_names[itr->tcp_state],
					itr->tcph.urg ? 'U' : '-',
					itr->tcph.ack ? 'A' : '-',
					itr->tcph.psh ? 'P' : '-',
					itr->tcph.rst ? 'R' : '-',
					itr->tcph.syn ? 'S' : '-',
					itr->tcph.fin ? 'F' : '-',
					itr->ct_status);
	}
	rcu_read_unlock_bh();
	return len;
}
/* sysfs attributes: "stats" (show/clear), "flush" (write-only),
 * "bucket" (select + dump a single bucket). */
static FP_ATTR(stats, S_IRUGO|S_IWUSR, fpdb_sysfs_stats_show, fpdb_sysfs_stats_clear);
static FP_ATTR(flush, S_IWUSR, NULL, fpdb_sysfs_flush);
static FP_ATTR(bucket, S_IRUGO|S_IWUSR, fpdb_sysfs_entry_debug_show, fpdb_sysfs_entry_debug_select);

static struct attribute *fp_database_attrs[] = {
	&fp_attr_stats.attr,
	&fp_attr_flush.attr,
	&fp_attr_bucket.attr,
	NULL, /* need to NULL terminate the list of attributes */
};
#ifdef CONFIG_PROC_FS
/* True while the seq iterator sits on the very first entry; used by
 * fpdb_seq_show() to emit the header row exactly once. */
static bool first;

/* Per-open iterator state for /proc/net/fastpath. */
struct fpdb_iter_state {
	struct seq_net_private p;
	unsigned int bucket;
};

/*
 * Find the first non-empty bucket and return its first node, setting
 * the 'first' flag.  Returns NULL when the table is empty.
 * Caller holds rcu_read_lock_bh() (taken in fpdb_seq_start()).
 */
static struct hlist_node *fpdb_entries_get_first(struct seq_file *seq)
{
	struct fpdb_iter_state *st = seq->private;
	struct hlist_node *n;

	for (st->bucket = 0; st->bucket < db->htable.size; st->bucket++) {
		n = rcu_dereference(db->htable.h[st->bucket].first);
		if (n) {
			first = true;
			return n;
		}
	}
	return NULL;
}
/*
 * Advance to the next node, rolling over into subsequent buckets as
 * needed.  Clears 'first' (header already printed).  Returns NULL at
 * the end of the table.
 */
static struct hlist_node *fpdb_entries_get_next(struct seq_file *seq,
		struct hlist_node *head)
{
	struct fpdb_iter_state *st = seq->private;

	first = false;
	head = rcu_dereference(head->next);
	while (head == NULL) {
		if (++st->bucket >= db->htable.size)
			return NULL;
		head = rcu_dereference(db->htable.h[st->bucket].first);
	}
	return head;
}
/* Return the pos-th node of the table, or NULL when out of range. */
static struct hlist_node *fpdb_entries_get_idx(struct seq_file *seq, loff_t pos)
{
	struct hlist_node *node;

	for (node = fpdb_entries_get_first(seq); node && pos; pos--)
		node = fpdb_entries_get_next(seq, node);
	return node;
}
/* seq_file start: enter the RCU-BH critical section and seek to *pos. */
static void *fpdb_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock_bh();
	return fpdb_entries_get_idx(seq, *pos);
}
/* seq_file next: advance the position and return the next node. */
static void *fpdb_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	(*pos)++;
	return fpdb_entries_get_next(seq, v);
}
/* seq_file stop: leave the RCU-BH critical section taken in start. */
static void fpdb_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock_bh();
}
/*
 * seq_file show: print one entry as a table row (same columns as
 * fpdb_print_entry(), plus the TOE speed under CONFIG_ASR_TOE).
 * Takes a temporary reference; dying entries (rc == 0) print nothing.
 */
static int fpdb_seq_show(struct seq_file *s, void *v)
{
	struct hlist_node *n = v;
	struct fpdb_entry *el;
	char in[256], out[256];
	unsigned int state, use;

	el = hlist_entry(n, struct fpdb_entry, hlist);
	if (atomic_inc_not_zero(&el->rc)) {
		if (first == true) {
			seq_printf(s, "l2 l3 l4 timeout\thash\thits\tstate in_dev out_dev tuple_in tuple_out ct block use refcnt\n");
		}
		__fp_dump_tuple(in, &el->in_tuple, 0);
		__fp_dump_tuple(out, &el->out_tuple, 0);
		/* tcp.state only meaningful for TCP; "N/A" printed for UDP. */
		state = el->ct->proto.tcp.state;
		use = atomic_read(&el->ct->ct_general.use);
		seq_printf(s, "%s %s %s %d\t%d\t%d\t%s %s %s %s %s %p %d %d %d"
#ifdef CONFIG_ASR_TOE
				" %dKbps"
#endif
				"\n",
				el->hh.hh_len ? "eth" : "NA",
				el->in_tuple.src.l3num == AF_INET6 ?
				"ipv6" : "ipv4",
				el->in_tuple.dst.protonum == IPPROTO_UDP ?
				"udp" : "tcp",
				jiffies_to_msecs(el->ct->timeout - jiffies) / 1000U,
				el->bucket, el->hit_counter,
				el->in_tuple.dst.protonum == IPPROTO_UDP ?
				"N/A" : tcp_conntrack_names[state],
				el->in_dev->dev->name,
				el->out_dev->dev->name,
				in, out, el->ct, el->block, use, atomic_read(&el->rc)
#ifdef CONFIG_ASR_TOE
				, el->speed
#endif
				);
		fpdb_put(el);
	}
	return 0;
}
/* seq_file operations backing /proc/net/fastpath. */
static const struct seq_operations fpdb_seq_ops = {
	.start = fpdb_seq_start,
	.next = fpdb_seq_next,
	.stop = fpdb_seq_stop,
	.show = fpdb_seq_show
};
#endif /* CONFIG_PROC_FS */
/* Per-namespace init: create the read-only /proc/net/fastpath file. */
static int fpdb_net_init(struct net *net)
{
	if (!proc_create_net("fastpath", 0440, net->proc_net, &fpdb_seq_ops,
			sizeof(struct fpdb_iter_state)))
		return -ENOMEM;
	return 0;
}
/* Per-namespace teardown: remove /proc/net/fastpath. */
static void fpdb_net_exit(struct net *net)
{
	remove_proc_entry("fastpath", net->proc_net);
}
/* Per-network-namespace registration of the proc interface. */
static struct pernet_operations fpdb_net_ops = {
	.init = fpdb_net_init,
	.exit = fpdb_net_exit,
};
/*
 * kobject release: final teardown of the database module.  Flushes all
 * entries, then waits (up to ~2 s) for outstanding RCU frees to drain
 * before destroying the slab cache and freeing the database and module.
 */
static void fp_database_release(struct kobject *kobj)
{
	struct fastpath_module *module = to_fpmod(kobj);
	int wait_time = 200;

	fpdb_flush();
	do {
		/* wait all fpdb freed, then call kmem_cache_destroy */
		synchronize_rcu();
		msleep(10);
		if (--wait_time <= 0)
			break;
	} while (db->num_entries);
	pr_info("%d fpdb entry left\n", db->num_entries);
	nf_ct_extend_unregister(db->nfct_ext);
	unregister_pernet_subsys(&fpdb_net_ops);
	fpdb_free_hashtable(&db->htable);
	kmem_cache_destroy(db->db_cache);
#ifdef FP_USE_SRAM_POOL_OPT
	sram_pool_free((unsigned long)db, sizeof(struct fp_database));
#else
	kfree(db);
#endif
	kfree(module);
	pr_debug("fp_database released\n");
}
/* kobject type: sysfs ops, default attributes and the release handler. */
static struct kobj_type ktype_database = {
	.sysfs_ops = &fp_sysfs_ops,
	.default_attrs = fp_database_attrs,
	.release = fp_database_release,
};
/*
 * Conntrack extension destructor: when a conntrack dies, detach and
 * delete both direction entries (if any) under the database lock.
 */
static void fpdb_destroy_ext(struct nf_conn *ct)
{
	struct nf_conn_fastpath *ct_fp;
	struct fpdb_entry *orig, *reply;

	BUG_ON(!ct);
	spin_lock_bh(&db->lock);
	ct_fp = nfct_fastpath(ct);
	if (ct_fp) {
		orig = ct_fp->fpd_el[IP_CT_DIR_ORIGINAL];
		reply = ct_fp->fpd_el[IP_CT_DIR_REPLY];
	} else {
		orig = NULL;
		reply = NULL;
	}
	if (orig == NULL && reply == NULL) {
		spin_unlock_bh(&db->lock);
		return;
	}
	/* Detach first so nothing else can reach the dying entries. */
	ct_fp->fpd_el[IP_CT_DIR_ORIGINAL] = NULL;
	ct_fp->fpd_el[IP_CT_DIR_REPLY] = NULL;
	if (orig) {
		FP_DEBUG_DUMP_ENTRY("Delete orig entry:\n", orig);
		__fpdb_del(orig, true);
	}
	if (reply) {
		FP_DEBUG_DUMP_ENTRY("Delete reply entry:\n", reply);
		__fpdb_del(reply, true);
	}
	spin_unlock_bh(&db->lock);
}
/* Conntrack extension descriptor: per-conntrack fastpath entry pointers,
 * destroyed via fpdb_destroy_ext() when the conntrack dies. */
static struct nf_ct_ext_type fpdb_ct_extend = {
	.len = sizeof(struct nf_conn_fastpath),
	.align = __alignof__(struct nf_conn_fastpath),
	.id = NF_CT_EXT_FASTPATH,
	.destroy = fpdb_destroy_ext,
};
/*
 * Module probe: allocate the database, its slab cache and hash table,
 * register the proc interface and the conntrack extension, and expose
 * the sysfs kobject.  Unwinds in reverse order on failure.
 */
static int fp_database_probe(struct fastpath_module *module)
{
	struct fp_database *priv;
	int ret;

#ifdef FP_USE_SRAM_POOL_OPT
	priv = (struct fp_database *)sram_pool_alloc(sizeof(struct fp_database));
#else
	priv = kzalloc(sizeof(struct fp_database), GFP_KERNEL);
#endif
	if (!priv) {
		pr_err("no memory\n");
		return -ENOMEM;
	}
	spin_lock_init(&priv->lock);
	get_random_bytes(&hash_rnd, sizeof(hash_rnd));
	priv->db_cache = kmem_cache_create("fpdb_entry",
			sizeof(struct fpdb_entry), 0, SLAB_HWCACHE_ALIGN, NULL);
	if (!priv->db_cache) {
		pr_err("kmem_cache_create fpdb_entry failed\n");
		ret = -ENOMEM;
		goto kfree_priv;
	}
	ret = fpdb_alloc_hashtable(&priv->htable);
	if (ret < 0) {
		pr_err("fpdb_alloc_hashtable failed (ret=%d)\n", ret);
		goto kfree_cache;
	}
	ret = register_pernet_subsys(&fpdb_net_ops);
	if (ret < 0) {
		pr_err("cannot register pernet operations (ret=%d)\n", ret);
		goto free_hashtable;
	}
	priv->nfct_ext = &fpdb_ct_extend;
	ret = nf_ct_extend_register(priv->nfct_ext);
	if (ret < 0) {
		pr_err("nf_ct_extend_register failed (%d)\n", ret);
		goto unreg_pernet;
	}
	db = module->priv = priv;
	snprintf(module->name, sizeof(module->name), "fp_database");
	kobject_init(&module->kobj, &ktype_database);
	ret = kobject_add(&module->kobj, module->fastpath->kobj, "%s", module->name);
	if (ret < 0) {
		pr_err("kobject_add failed (%d)\n", ret);
		/*
		 * kobject_put() invokes fp_database_release(), which already
		 * unregisters the conntrack extension and pernet ops and
		 * frees the hashtable, cache, db and module.  The old code
		 * then fell through and repeated that teardown on freed
		 * memory (double unregister + use-after-free).
		 */
		kobject_put(&module->kobj);
		return ret;
	}
	kobject_uevent(&module->kobj, KOBJ_ADD);
	pr_debug("fp_database probed\n");
	return 0;

unreg_pernet:
	unregister_pernet_subsys(&fpdb_net_ops);
free_hashtable:
	fpdb_free_hashtable(&priv->htable);
kfree_cache:
	kmem_cache_destroy(priv->db_cache);
kfree_priv:
#ifdef FP_USE_SRAM_POOL_OPT
	sram_pool_free((unsigned long)priv, sizeof(struct fp_database));
#else
	kfree(priv);
#endif
	return ret;
}
/* Module remove: drop the sysfs kobject; release() performs teardown. */
static int fp_database_remove(struct fastpath_module *module)
{
	kobject_put(&module->kobj);
	pr_debug("fp_database removed\n");
	return 0;
}
/* Module entry points registered with the fastpath core. */
struct fastpath_module_ops fp_database_ops = {
	.probe = fp_database_probe,
	.remove = fp_database_remove,
};