Blame - src/kernel/linux/v4.19/drivers/net/vrf.c - T800

blob: 7f5ee6bb4430063d45a6e83ecbfe95fd5969ba1f [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* vrf.c: device driver to encapsulate a VRF space
				3	*
				4	* Copyright (c) 2015 Cumulus Networks. All rights reserved.
				5	* Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com>
				6	* Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
				7	*
				8	* Based on dummy, team and ipvlan drivers
				9	*
				10	* This program is free software; you can redistribute it and/or modify
				11	* it under the terms of the GNU General Public License as published by
				12	* the Free Software Foundation; either version 2 of the License, or
				13	* (at your option) any later version.
				14	*/
				15
				16	#include <linux/module.h>
				17	#include <linux/kernel.h>
				18	#include <linux/netdevice.h>
				19	#include <linux/etherdevice.h>
				20	#include <linux/ip.h>
				21	#include <linux/init.h>
				22	#include <linux/moduleparam.h>
				23	#include <linux/netfilter.h>
				24	#include <linux/rtnetlink.h>
				25	#include <net/rtnetlink.h>
				26	#include <linux/u64_stats_sync.h>
				27	#include <linux/hashtable.h>
				28
				29	#include <linux/inetdevice.h>
				30	#include <net/arp.h>
				31	#include <net/ip.h>
				32	#include <net/ip_fib.h>
				33	#include <net/ip6_fib.h>
				34	#include <net/ip6_route.h>
				35	#include <net/route.h>
				36	#include <net/addrconf.h>
				37	#include <net/l3mdev.h>
				38	#include <net/fib_rules.h>
				39	#include <net/netns/generic.h>
				40
				41	#define DRV_NAME "vrf"
				42	#define DRV_VERSION "1.0"
				43
				44	#define FIB_RULE_PREF 1000 /* default preference for FIB rules */
				45
				46	static unsigned int vrf_net_id;
				47
				48	struct net_vrf {
				49	struct rtable __rcu *rth;
				50	struct rt6_info __rcu *rt6;
				51	#if IS_ENABLED(CONFIG_IPV6)
				52	struct fib6_table *fib6_table;
				53	#endif
				54	u32 tb_id;
				55	};
				56
				57	struct pcpu_dstats {
				58	u64 tx_pkts;
				59	u64 tx_bytes;
				60	u64 tx_drps;
				61	u64 rx_pkts;
				62	u64 rx_bytes;
				63	u64 rx_drps;
				64	struct u64_stats_sync syncp;
				65	};
				66
				67	static void vrf_rx_stats(struct net_device *dev, int len)
				68	{
				69	struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
				70
				71	u64_stats_update_begin(&dstats->syncp);
				72	dstats->rx_pkts++;
				73	dstats->rx_bytes += len;
				74	u64_stats_update_end(&dstats->syncp);
				75	}
				76
				77	static void vrf_tx_error(struct net_device vrf_dev, struct sk_buff skb)
				78	{
				79	vrf_dev->stats.tx_errors++;
				80	kfree_skb(skb);
				81	}
				82
				83	static void vrf_get_stats64(struct net_device *dev,
				84	struct rtnl_link_stats64 *stats)
				85	{
				86	int i;
				87
				88	for_each_possible_cpu(i) {
				89	const struct pcpu_dstats *dstats;
				90	u64 tbytes, tpkts, tdrops, rbytes, rpkts;
				91	unsigned int start;
				92
				93	dstats = per_cpu_ptr(dev->dstats, i);
				94	do {
				95	start = u64_stats_fetch_begin_irq(&dstats->syncp);
				96	tbytes = dstats->tx_bytes;
				97	tpkts = dstats->tx_pkts;
				98	tdrops = dstats->tx_drps;
				99	rbytes = dstats->rx_bytes;
				100	rpkts = dstats->rx_pkts;
				101	} while (u64_stats_fetch_retry_irq(&dstats->syncp, start));
				102	stats->tx_bytes += tbytes;
				103	stats->tx_packets += tpkts;
				104	stats->tx_dropped += tdrops;
				105	stats->rx_bytes += rbytes;
				106	stats->rx_packets += rpkts;
				107	}
				108	}
				109
				110	/* by default VRF devices do not have a qdisc and are expected
				111	* to be created with only a single queue.
				112	*/
				113	static bool qdisc_tx_is_default(const struct net_device *dev)
				114	{
				115	struct netdev_queue *txq;
				116	struct Qdisc *qdisc;
				117
				118	if (dev->num_tx_queues > 1)
				119	return false;
				120
				121	txq = netdev_get_tx_queue(dev, 0);
				122	qdisc = rcu_access_pointer(txq->qdisc);
				123
				124	return !qdisc->enqueue;
				125	}
				126
				127	/* Local traffic destined to local address. Reinsert the packet to rx
				128	* path, similar to loopback handling.
				129	*/
				130	static int vrf_local_xmit(struct sk_buff skb, struct net_device dev,
				131	struct dst_entry *dst)
				132	{
				133	int len = skb->len;
				134
				135	skb_orphan(skb);
				136
				137	skb_dst_set(skb, dst);
				138
				139	/* set pkt_type to avoid skb hitting packet taps twice -
				140	* once on Tx and again in Rx processing
				141	*/
				142	skb->pkt_type = PACKET_LOOPBACK;
				143
				144	skb->protocol = eth_type_trans(skb, dev);
				145
				146	if (likely(netif_rx(skb) == NET_RX_SUCCESS))
				147	vrf_rx_stats(dev, len);
				148	else
				149	this_cpu_inc(dev->dstats->rx_drps);
				150
				151	return NETDEV_TX_OK;
				152	}
				153
				154	#if IS_ENABLED(CONFIG_IPV6)
				155	static int vrf_ip6_local_out(struct net net, struct sock sk,
				156	struct sk_buff *skb)
				157	{
				158	int err;
				159
				160	err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net,
				161	sk, skb, NULL, skb_dst(skb)->dev, dst_output);
				162
				163	if (likely(err == 1))
				164	err = dst_output(net, sk, skb);
				165
				166	return err;
				167	}
				168
				169	static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
				170	struct net_device *dev)
				171	{
				172	const struct ipv6hdr *iph;
				173	struct net *net = dev_net(skb->dev);
				174	struct flowi6 fl6;
				175	int ret = NET_XMIT_DROP;
				176	struct dst_entry *dst;
				177	struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst;
				178
				179	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
				180	goto err;
				181
				182	iph = ipv6_hdr(skb);
				183
				184	memset(&fl6, 0, sizeof(fl6));
				185	/* needed to match OIF rule */
				186	fl6.flowi6_oif = dev->ifindex;
				187	fl6.flowi6_iif = LOOPBACK_IFINDEX;
				188	fl6.daddr = iph->daddr;
				189	fl6.saddr = iph->saddr;
				190	fl6.flowlabel = ip6_flowinfo(iph);
				191	fl6.flowi6_mark = skb->mark;
				192	fl6.flowi6_proto = iph->nexthdr;
				193	fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF;
				194
				195	dst = ip6_route_output(net, NULL, &fl6);
				196	if (dst == dst_null)
				197	goto err;
				198
				199	skb_dst_drop(skb);
				200
				201	/* if dst.dev is loopback or the VRF device again this is locally
				202	* originated traffic destined to a local address. Short circuit
				203	* to Rx path
				204	*/
				205	if (dst->dev == dev)
				206	return vrf_local_xmit(skb, dev, dst);
				207
				208	skb_dst_set(skb, dst);
				209
				210	/* strip the ethernet header added for pass through VRF device */
				211	__skb_pull(skb, skb_network_offset(skb));
				212
				213	ret = vrf_ip6_local_out(net, skb->sk, skb);
				214	if (unlikely(net_xmit_eval(ret)))
				215	dev->stats.tx_errors++;
				216	else
				217	ret = NET_XMIT_SUCCESS;
				218
				219	return ret;
				220	err:
				221	vrf_tx_error(dev, skb);
				222	return NET_XMIT_DROP;
				223	}
				224	#else
				225	static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
				226	struct net_device *dev)
				227	{
				228	vrf_tx_error(dev, skb);
				229	return NET_XMIT_DROP;
				230	}
				231	#endif
				232
				233	/* based on ip_local_out; can't use it b/c the dst is switched pointing to us */
				234	static int vrf_ip_local_out(struct net net, struct sock sk,
				235	struct sk_buff *skb)
				236	{
				237	int err;
				238
				239	err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
				240	skb, NULL, skb_dst(skb)->dev, dst_output);
				241	if (likely(err == 1))
				242	err = dst_output(net, sk, skb);
				243
				244	return err;
				245	}
				246
				247	static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
				248	struct net_device *vrf_dev)
				249	{
				250	struct iphdr *ip4h;
				251	int ret = NET_XMIT_DROP;
				252	struct flowi4 fl4;
				253	struct net *net = dev_net(vrf_dev);
				254	struct rtable *rt;
				255
				256	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
				257	goto err;
				258
				259	ip4h = ip_hdr(skb);
				260
				261	memset(&fl4, 0, sizeof(fl4));
				262	/* needed to match OIF rule */
				263	fl4.flowi4_oif = vrf_dev->ifindex;
				264	fl4.flowi4_iif = LOOPBACK_IFINDEX;
				265	fl4.flowi4_tos = RT_TOS(ip4h->tos);
				266	fl4.flowi4_flags = FLOWI_FLAG_ANYSRC \| FLOWI_FLAG_SKIP_NH_OIF;
				267	fl4.flowi4_proto = ip4h->protocol;
				268	fl4.daddr = ip4h->daddr;
				269	fl4.saddr = ip4h->saddr;
				270
				271	rt = ip_route_output_flow(net, &fl4, NULL);
				272	if (IS_ERR(rt))
				273	goto err;
				274
				275	skb_dst_drop(skb);
				276
				277	/* if dst.dev is loopback or the VRF device again this is locally
				278	* originated traffic destined to a local address. Short circuit
				279	* to Rx path
				280	*/
				281	if (rt->dst.dev == vrf_dev)
				282	return vrf_local_xmit(skb, vrf_dev, &rt->dst);
				283
				284	skb_dst_set(skb, &rt->dst);
				285
				286	/* strip the ethernet header added for pass through VRF device */
				287	__skb_pull(skb, skb_network_offset(skb));
				288
				289	if (!ip4h->saddr) {
				290	ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0,
				291	RT_SCOPE_LINK);
				292	}
				293
				294	ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
				295	if (unlikely(net_xmit_eval(ret)))
				296	vrf_dev->stats.tx_errors++;
				297	else
				298	ret = NET_XMIT_SUCCESS;
				299
				300	out:
				301	return ret;
				302	err:
				303	vrf_tx_error(vrf_dev, skb);
				304	goto out;
				305	}
				306
				307	static netdev_tx_t is_ip_tx_frame(struct sk_buff skb, struct net_device dev)
				308	{
				309	switch (skb->protocol) {
				310	case htons(ETH_P_IP):
				311	return vrf_process_v4_outbound(skb, dev);
				312	case htons(ETH_P_IPV6):
				313	return vrf_process_v6_outbound(skb, dev);
				314	default:
				315	vrf_tx_error(dev, skb);
				316	return NET_XMIT_DROP;
				317	}
				318	}
				319
				320	static netdev_tx_t vrf_xmit(struct sk_buff skb, struct net_device dev)
				321	{
				322	int len = skb->len;
				323	netdev_tx_t ret = is_ip_tx_frame(skb, dev);
				324
				325	if (likely(ret == NET_XMIT_SUCCESS \|\| ret == NET_XMIT_CN)) {
				326	struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
				327
				328	u64_stats_update_begin(&dstats->syncp);
				329	dstats->tx_pkts++;
				330	dstats->tx_bytes += len;
				331	u64_stats_update_end(&dstats->syncp);
				332	} else {
				333	this_cpu_inc(dev->dstats->tx_drps);
				334	}
				335
				336	return ret;
				337	}
				338
				339	static int vrf_finish_direct(struct net net, struct sock sk,
				340	struct sk_buff *skb)
				341	{
				342	struct net_device *vrf_dev = skb->dev;
				343
				344	if (!list_empty(&vrf_dev->ptype_all) &&
				345	likely(skb_headroom(skb) >= ETH_HLEN)) {
				346	struct ethhdr *eth = skb_push(skb, ETH_HLEN);
				347
				348	ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
				349	eth_zero_addr(eth->h_dest);
				350	eth->h_proto = skb->protocol;
				351
				352	rcu_read_lock_bh();
				353	dev_queue_xmit_nit(skb, vrf_dev);
				354	rcu_read_unlock_bh();
				355
				356	skb_pull(skb, ETH_HLEN);
				357	}
				358
				359	return 1;
				360	}
				361
				362	#if IS_ENABLED(CONFIG_IPV6)
				363	/* modelled after ip6_finish_output2 */
				364	static int vrf_finish_output6(struct net net, struct sock sk,
				365	struct sk_buff *skb)
				366	{
				367	struct dst_entry *dst = skb_dst(skb);
				368	struct net_device *dev = dst->dev;
				369	struct neighbour *neigh;
				370	struct in6_addr *nexthop;
				371	int ret;
				372
				373	nf_reset(skb);
				374
				375	skb->protocol = htons(ETH_P_IPV6);
				376	skb->dev = dev;
				377
				378	rcu_read_lock_bh();
				379	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
				380	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
				381	if (unlikely(!neigh))
				382	neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
				383	if (!IS_ERR(neigh)) {
				384	sock_confirm_neigh(skb, neigh);
				385	ret = neigh_output(neigh, skb);
				386	rcu_read_unlock_bh();
				387	return ret;
				388	}
				389	rcu_read_unlock_bh();
				390
				391	IP6_INC_STATS(dev_net(dst->dev),
				392	ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
				393	kfree_skb(skb);
				394	return -EINVAL;
				395	}
				396
				397	/* modelled after ip6_output */
				398	static int vrf_output6(struct net net, struct sock sk, struct sk_buff *skb)
				399	{
				400	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
				401	net, sk, skb, NULL, skb_dst(skb)->dev,
				402	vrf_finish_output6,
				403	!(IP6CB(skb)->flags & IP6SKB_REROUTED));
				404	}
				405
				406	/* set dst on skb to send packet to us via dev_xmit path. Allows
				407	* packet to go through device based features such as qdisc, netfilter
				408	* hooks and packet sockets with skb->dev set to vrf device.
				409	*/
				410	static struct sk_buff vrf_ip6_out_redirect(struct net_device vrf_dev,
				411	struct sk_buff *skb)
				412	{
				413	struct net_vrf *vrf = netdev_priv(vrf_dev);
				414	struct dst_entry *dst = NULL;
				415	struct rt6_info *rt6;
				416
				417	rcu_read_lock();
				418
				419	rt6 = rcu_dereference(vrf->rt6);
				420	if (likely(rt6)) {
				421	dst = &rt6->dst;
				422	dst_hold(dst);
				423	}
				424
				425	rcu_read_unlock();
				426
				427	if (unlikely(!dst)) {
				428	vrf_tx_error(vrf_dev, skb);
				429	return NULL;
				430	}
				431
				432	skb_dst_drop(skb);
				433	skb_dst_set(skb, dst);
				434
				435	return skb;
				436	}
				437
				438	static int vrf_output6_direct(struct net net, struct sock sk,
				439	struct sk_buff *skb)
				440	{
				441	skb->protocol = htons(ETH_P_IPV6);
				442
				443	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
				444	net, sk, skb, NULL, skb->dev,
				445	vrf_finish_direct,
				446	!(IPCB(skb)->flags & IPSKB_REROUTED));
				447	}
				448
				449	static struct sk_buff vrf_ip6_out_direct(struct net_device vrf_dev,
				450	struct sock *sk,
				451	struct sk_buff *skb)
				452	{
				453	struct net *net = dev_net(vrf_dev);
				454	int err;
				455
				456	skb->dev = vrf_dev;
				457
				458	err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk,
				459	skb, NULL, vrf_dev, vrf_output6_direct);
				460
				461	if (likely(err == 1))
				462	err = vrf_output6_direct(net, sk, skb);
				463
				464	/* reset skb device */
				465	if (likely(err == 1))
				466	nf_reset(skb);
				467	else
				468	skb = NULL;
				469
				470	return skb;
				471	}
				472
				473	static struct sk_buff vrf_ip6_out(struct net_device vrf_dev,
				474	struct sock *sk,
				475	struct sk_buff *skb)
				476	{
				477	/* don't divert link scope packets */
				478	if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
				479	return skb;
				480
				481	if (qdisc_tx_is_default(vrf_dev))
				482	return vrf_ip6_out_direct(vrf_dev, sk, skb);
				483
				484	return vrf_ip6_out_redirect(vrf_dev, skb);
				485	}
				486
				487	/* holding rtnl */
				488	static void vrf_rt6_release(struct net_device dev, struct net_vrf vrf)
				489	{
				490	struct rt6_info *rt6 = rtnl_dereference(vrf->rt6);
				491	struct net *net = dev_net(dev);
				492	struct dst_entry *dst;
				493
				494	RCU_INIT_POINTER(vrf->rt6, NULL);
				495	synchronize_rcu();
				496
				497	/* move dev in dst's to loopback so this VRF device can be deleted
				498	* - based on dst_ifdown
				499	*/
				500	if (rt6) {
				501	dst = &rt6->dst;
				502	dev_put(dst->dev);
				503	dst->dev = net->loopback_dev;
				504	dev_hold(dst->dev);
				505	dst_release(dst);
				506	}
				507	}
				508
				509	static int vrf_rt6_create(struct net_device *dev)
				510	{
				511	int flags = DST_HOST \| DST_NOPOLICY \| DST_NOXFRM;
				512	struct net_vrf *vrf = netdev_priv(dev);
				513	struct net *net = dev_net(dev);
				514	struct rt6_info *rt6;
				515	int rc = -ENOMEM;
				516
				517	/* IPv6 can be CONFIG enabled and then disabled runtime */
				518	if (!ipv6_mod_enabled())
				519	return 0;
				520
				521	vrf->fib6_table = fib6_new_table(net, vrf->tb_id);
				522	if (!vrf->fib6_table)
				523	goto out;
				524
				525	/* create a dst for routing packets out a VRF device */
				526	rt6 = ip6_dst_alloc(net, dev, flags);
				527	if (!rt6)
				528	goto out;
				529
				530	rt6->dst.output = vrf_output6;
				531
				532	rcu_assign_pointer(vrf->rt6, rt6);
				533
				534	rc = 0;
				535	out:
				536	return rc;
				537	}
				538	#else
				539	static struct sk_buff vrf_ip6_out(struct net_device vrf_dev,
				540	struct sock *sk,
				541	struct sk_buff *skb)
				542	{
				543	return skb;
				544	}
				545
				546	static void vrf_rt6_release(struct net_device dev, struct net_vrf vrf)
				547	{
				548	}
				549
				550	static int vrf_rt6_create(struct net_device *dev)
				551	{
				552	return 0;
				553	}
				554	#endif
				555
				556	/* modelled after ip_finish_output2 */
				557	static int vrf_finish_output(struct net net, struct sock sk, struct sk_buff *skb)
				558	{
				559	struct dst_entry *dst = skb_dst(skb);
				560	struct rtable rt = (struct rtable )dst;
				561	struct net_device *dev = dst->dev;
				562	unsigned int hh_len = LL_RESERVED_SPACE(dev);
				563	struct neighbour *neigh;
				564	u32 nexthop;
				565	int ret = -EINVAL;
				566
				567	nf_reset(skb);
				568
				569	/* Be paranoid, rather than too clever. */
				570	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
				571	struct sk_buff *skb2;
				572
				573	skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
				574	if (!skb2) {
				575	ret = -ENOMEM;
				576	goto err;
				577	}
				578	if (skb->sk)
				579	skb_set_owner_w(skb2, skb->sk);
				580
				581	consume_skb(skb);
				582	skb = skb2;
				583	}
				584
				585	rcu_read_lock_bh();
				586
				587	nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr);
				588	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
				589	if (unlikely(!neigh))
				590	neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
				591	if (!IS_ERR(neigh)) {
				592	sock_confirm_neigh(skb, neigh);
				593	ret = neigh_output(neigh, skb);
				594	rcu_read_unlock_bh();
				595	return ret;
				596	}
				597
				598	rcu_read_unlock_bh();
				599	err:
				600	vrf_tx_error(skb->dev, skb);
				601	return ret;
				602	}
				603
				604	static int vrf_output(struct net net, struct sock sk, struct sk_buff *skb)
				605	{
				606	struct net_device *dev = skb_dst(skb)->dev;
				607
				608	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
				609
				610	skb->dev = dev;
				611	skb->protocol = htons(ETH_P_IP);
				612
				613	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
				614	net, sk, skb, NULL, dev,
				615	vrf_finish_output,
				616	!(IPCB(skb)->flags & IPSKB_REROUTED));
				617	}
				618
				619	/* set dst on skb to send packet to us via dev_xmit path. Allows
				620	* packet to go through device based features such as qdisc, netfilter
				621	* hooks and packet sockets with skb->dev set to vrf device.
				622	*/
				623	static struct sk_buff vrf_ip_out_redirect(struct net_device vrf_dev,
				624	struct sk_buff *skb)
				625	{
				626	struct net_vrf *vrf = netdev_priv(vrf_dev);
				627	struct dst_entry *dst = NULL;
				628	struct rtable *rth;
				629
				630	rcu_read_lock();
				631
				632	rth = rcu_dereference(vrf->rth);
				633	if (likely(rth)) {
				634	dst = &rth->dst;
				635	dst_hold(dst);
				636	}
				637
				638	rcu_read_unlock();
				639
				640	if (unlikely(!dst)) {
				641	vrf_tx_error(vrf_dev, skb);
				642	return NULL;
				643	}
				644
				645	skb_dst_drop(skb);
				646	skb_dst_set(skb, dst);
				647
				648	return skb;
				649	}
				650
				651	static int vrf_output_direct(struct net net, struct sock sk,
				652	struct sk_buff *skb)
				653	{
				654	skb->protocol = htons(ETH_P_IP);
				655
				656	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
				657	net, sk, skb, NULL, skb->dev,
				658	vrf_finish_direct,
				659	!(IPCB(skb)->flags & IPSKB_REROUTED));
				660	}
				661
				662	static struct sk_buff vrf_ip_out_direct(struct net_device vrf_dev,
				663	struct sock *sk,
				664	struct sk_buff *skb)
				665	{
				666	struct net *net = dev_net(vrf_dev);
				667	int err;
				668
				669	skb->dev = vrf_dev;
				670
				671	err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
				672	skb, NULL, vrf_dev, vrf_output_direct);
				673
				674	if (likely(err == 1))
				675	err = vrf_output_direct(net, sk, skb);
				676
				677	/* reset skb device */
				678	if (likely(err == 1))
				679	nf_reset(skb);
				680	else
				681	skb = NULL;
				682
				683	return skb;
				684	}
				685
				686	static struct sk_buff vrf_ip_out(struct net_device vrf_dev,
				687	struct sock *sk,
				688	struct sk_buff *skb)
				689	{
				690	/* don't divert multicast or local broadcast */
				691	if (ipv4_is_multicast(ip_hdr(skb)->daddr) \|\|
				692	ipv4_is_lbcast(ip_hdr(skb)->daddr))
				693	return skb;
				694
				695	if (qdisc_tx_is_default(vrf_dev))
				696	return vrf_ip_out_direct(vrf_dev, sk, skb);
				697
				698	return vrf_ip_out_redirect(vrf_dev, skb);
				699	}
				700
				701	/* called with rcu lock held */
				702	static struct sk_buff vrf_l3_out(struct net_device vrf_dev,
				703	struct sock *sk,
				704	struct sk_buff *skb,
				705	u16 proto)
				706	{
				707	switch (proto) {
				708	case AF_INET:
				709	return vrf_ip_out(vrf_dev, sk, skb);
				710	case AF_INET6:
				711	return vrf_ip6_out(vrf_dev, sk, skb);
				712	}
				713
				714	return skb;
				715	}
				716
				717	/* holding rtnl */
				718	static void vrf_rtable_release(struct net_device dev, struct net_vrf vrf)
				719	{
				720	struct rtable *rth = rtnl_dereference(vrf->rth);
				721	struct net *net = dev_net(dev);
				722	struct dst_entry *dst;
				723
				724	RCU_INIT_POINTER(vrf->rth, NULL);
				725	synchronize_rcu();
				726
				727	/* move dev in dst's to loopback so this VRF device can be deleted
				728	* - based on dst_ifdown
				729	*/
				730	if (rth) {
				731	dst = &rth->dst;
				732	dev_put(dst->dev);
				733	dst->dev = net->loopback_dev;
				734	dev_hold(dst->dev);
				735	dst_release(dst);
				736	}
				737	}
				738
				739	static int vrf_rtable_create(struct net_device *dev)
				740	{
				741	struct net_vrf *vrf = netdev_priv(dev);
				742	struct rtable *rth;
				743
				744	if (!fib_new_table(dev_net(dev), vrf->tb_id))
				745	return -ENOMEM;
				746
				747	/* create a dst for routing packets out through a VRF device */
				748	rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0);
				749	if (!rth)
				750	return -ENOMEM;
				751
				752	rth->dst.output = vrf_output;
				753
				754	rcu_assign_pointer(vrf->rth, rth);
				755
				756	return 0;
				757	}
				758
				759	/************************** device handling ******************/
				760
				761	/* cycle interface to flush neighbor cache and move routes across tables */
				762	static void cycle_netdev(struct net_device *dev)
				763	{
				764	unsigned int flags = dev->flags;
				765	int ret;
				766
				767	if (!netif_running(dev))
				768	return;
				769
				770	ret = dev_change_flags(dev, flags & ~IFF_UP);
				771	if (ret >= 0)
				772	ret = dev_change_flags(dev, flags);
				773
				774	if (ret < 0) {
				775	netdev_err(dev,
				776	"Failed to cycle device %s; route tables might be wrong!\n",
				777	dev->name);
				778	}
				779	}
				780
				781	static int do_vrf_add_slave(struct net_device dev, struct net_device port_dev,
				782	struct netlink_ext_ack *extack)
				783	{
				784	int ret;
				785
				786	/* do not allow loopback device to be enslaved to a VRF.
				787	* The vrf device acts as the loopback for the vrf.
				788	*/
				789	if (port_dev == dev_net(dev)->loopback_dev) {
				790	NL_SET_ERR_MSG(extack,
				791	"Can not enslave loopback device to a VRF");
				792	return -EOPNOTSUPP;
				793	}
				794
				795	port_dev->priv_flags \|= IFF_L3MDEV_SLAVE;
				796	ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack);
				797	if (ret < 0)
				798	goto err;
				799
				800	cycle_netdev(port_dev);
				801
				802	return 0;
				803
				804	err:
				805	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
				806	return ret;
				807	}
				808
				809	static int vrf_add_slave(struct net_device dev, struct net_device port_dev,
				810	struct netlink_ext_ack *extack)
				811	{
				812	if (netif_is_l3_master(port_dev)) {
				813	NL_SET_ERR_MSG(extack,
				814	"Can not enslave an L3 master device to a VRF");
				815	return -EINVAL;
				816	}
				817
				818	if (netif_is_l3_slave(port_dev))
				819	return -EINVAL;
				820
				821	return do_vrf_add_slave(dev, port_dev, extack);
				822	}
				823
				824	/* inverse of do_vrf_add_slave */
				825	static int do_vrf_del_slave(struct net_device dev, struct net_device port_dev)
				826	{
				827	netdev_upper_dev_unlink(port_dev, dev);
				828	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
				829
				830	cycle_netdev(port_dev);
				831
				832	return 0;
				833	}
				834
				835	static int vrf_del_slave(struct net_device dev, struct net_device port_dev)
				836	{
				837	return do_vrf_del_slave(dev, port_dev);
				838	}
				839
				840	static void vrf_dev_uninit(struct net_device *dev)
				841	{
				842	struct net_vrf *vrf = netdev_priv(dev);
				843
				844	vrf_rtable_release(dev, vrf);
				845	vrf_rt6_release(dev, vrf);
				846
				847	free_percpu(dev->dstats);
				848	dev->dstats = NULL;
				849	}
				850
				851	static int vrf_dev_init(struct net_device *dev)
				852	{
				853	struct net_vrf *vrf = netdev_priv(dev);
				854
				855	dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
				856	if (!dev->dstats)
				857	goto out_nomem;
				858
				859	/* create the default dst which points back to us */
				860	if (vrf_rtable_create(dev) != 0)
				861	goto out_stats;
				862
				863	if (vrf_rt6_create(dev) != 0)
				864	goto out_rth;
				865
				866	dev->flags = IFF_MASTER \| IFF_NOARP;
				867
				868	/* MTU is irrelevant for VRF device; set to 64k similar to lo */
				869	dev->mtu = 64 * 1024;
				870
				871	/* similarly, oper state is irrelevant; set to up to avoid confusion */
				872	dev->operstate = IF_OPER_UP;
				873	netdev_lockdep_set_classes(dev);
				874	return 0;
				875
				876	out_rth:
				877	vrf_rtable_release(dev, vrf);
				878	out_stats:
				879	free_percpu(dev->dstats);
				880	dev->dstats = NULL;
				881	out_nomem:
				882	return -ENOMEM;
				883	}
				884
				885	static const struct net_device_ops vrf_netdev_ops = {
				886	.ndo_init = vrf_dev_init,
				887	.ndo_uninit = vrf_dev_uninit,
				888	.ndo_start_xmit = vrf_xmit,
				889	.ndo_get_stats64 = vrf_get_stats64,
				890	.ndo_add_slave = vrf_add_slave,
				891	.ndo_del_slave = vrf_del_slave,
				892	};
				893
				894	static u32 vrf_fib_table(const struct net_device *dev)
				895	{
				896	struct net_vrf *vrf = netdev_priv(dev);
				897
				898	return vrf->tb_id;
				899	}
				900
				901	static int vrf_rcv_finish(struct net net, struct sock sk, struct sk_buff *skb)
				902	{
				903	kfree_skb(skb);
				904	return 0;
				905	}
				906
				907	static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook,
				908	struct sk_buff *skb,
				909	struct net_device *dev)
				910	{
				911	struct net *net = dev_net(dev);
				912
				913	if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1)
				914	skb = NULL; /* kfree_skb(skb) handled by nf code */
				915
				916	return skb;
				917	}
				918
				919	#if IS_ENABLED(CONFIG_IPV6)
				920	/* neighbor handling is done with actual device; do not want
				921	* to flip skb->dev for those ndisc packets. This really fails
				922	* for multiple next protocols (e.g., NEXTHDR_HOP). But it is
				923	* a start.
				924	*/
				925	static bool ipv6_ndisc_frame(const struct sk_buff *skb)
				926	{
				927	const struct ipv6hdr *iph = ipv6_hdr(skb);
				928	bool rc = false;
				929
				930	if (iph->nexthdr == NEXTHDR_ICMP) {
				931	const struct icmp6hdr *icmph;
				932	struct icmp6hdr _icmph;
				933
				934	icmph = skb_header_pointer(skb, sizeof(*iph),
				935	sizeof(_icmph), &_icmph);
				936	if (!icmph)
				937	goto out;
				938
				939	switch (icmph->icmp6_type) {
				940	case NDISC_ROUTER_SOLICITATION:
				941	case NDISC_ROUTER_ADVERTISEMENT:
				942	case NDISC_NEIGHBOUR_SOLICITATION:
				943	case NDISC_NEIGHBOUR_ADVERTISEMENT:
				944	case NDISC_REDIRECT:
				945	rc = true;
				946	break;
				947	}
				948	}
				949
				950	out:
				951	return rc;
				952	}
				953
				954	static struct rt6_info vrf_ip6_route_lookup(struct net net,
				955	const struct net_device *dev,
				956	struct flowi6 *fl6,
				957	int ifindex,
				958	const struct sk_buff *skb,
				959	int flags)
				960	{
				961	struct net_vrf *vrf = netdev_priv(dev);
				962
				963	return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags);
				964	}
				965
				966	static void vrf_ip6_input_dst(struct sk_buff skb, struct net_device vrf_dev,
				967	int ifindex)
				968	{
				969	const struct ipv6hdr *iph = ipv6_hdr(skb);
				970	struct flowi6 fl6 = {
				971	.flowi6_iif = ifindex,
				972	.flowi6_mark = skb->mark,
				973	.flowi6_proto = iph->nexthdr,
				974	.daddr = iph->daddr,
				975	.saddr = iph->saddr,
				976	.flowlabel = ip6_flowinfo(iph),
				977	};
				978	struct net *net = dev_net(vrf_dev);
				979	struct rt6_info *rt6;
				980
				981	rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb,
				982	RT6_LOOKUP_F_HAS_SADDR \| RT6_LOOKUP_F_IFACE);
				983	if (unlikely(!rt6))
				984	return;
				985
				986	if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst))
				987	return;
				988
				989	skb_dst_set(skb, &rt6->dst);
				990	}
				991
				992	static struct sk_buff vrf_ip6_rcv(struct net_device vrf_dev,
				993	struct sk_buff *skb)
				994	{
				995	int orig_iif = skb->skb_iif;
				996	bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
				997	bool is_ndisc = ipv6_ndisc_frame(skb);
				998
				999	/* loopback, multicast & non-ND link-local traffic; do not push through
				1000	* packet taps again. Reset pkt_type for upper layers to process skb
				1001	*/
				1002	if (skb->pkt_type == PACKET_LOOPBACK \|\| (need_strict && !is_ndisc)) {
				1003	skb->dev = vrf_dev;
				1004	skb->skb_iif = vrf_dev->ifindex;
				1005	IP6CB(skb)->flags \|= IP6SKB_L3SLAVE;
				1006	if (skb->pkt_type == PACKET_LOOPBACK)
				1007	skb->pkt_type = PACKET_HOST;
				1008	goto out;
				1009	}
				1010
				1011	/* if packet is NDISC then keep the ingress interface */
				1012	if (!is_ndisc) {
				1013	vrf_rx_stats(vrf_dev, skb->len);
				1014	skb->dev = vrf_dev;
				1015	skb->skb_iif = vrf_dev->ifindex;
				1016
				1017	if (!list_empty(&vrf_dev->ptype_all)) {
				1018	skb_push(skb, skb->mac_len);
				1019	dev_queue_xmit_nit(skb, vrf_dev);
				1020	skb_pull(skb, skb->mac_len);
				1021	}
				1022
				1023	IP6CB(skb)->flags \|= IP6SKB_L3SLAVE;
				1024	}
				1025
				1026	if (need_strict)
				1027	vrf_ip6_input_dst(skb, vrf_dev, orig_iif);
				1028
				1029	skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev);
				1030	out:
				1031	return skb;
				1032	}
				1033
				1034	#else
				1035	static struct sk_buff vrf_ip6_rcv(struct net_device vrf_dev,
				1036	struct sk_buff *skb)
				1037	{
				1038	return skb;
				1039	}
				1040	#endif
				1041
				1042	static struct sk_buff vrf_ip_rcv(struct net_device vrf_dev,
				1043	struct sk_buff *skb)
				1044	{
				1045	skb->dev = vrf_dev;
				1046	skb->skb_iif = vrf_dev->ifindex;
				1047	IPCB(skb)->flags \|= IPSKB_L3SLAVE;
				1048
				1049	if (ipv4_is_multicast(ip_hdr(skb)->daddr))
				1050	goto out;
				1051
				1052	/* loopback traffic; do not push through packet taps again.
				1053	* Reset pkt_type for upper layers to process skb
				1054	*/
				1055	if (skb->pkt_type == PACKET_LOOPBACK) {
				1056	skb->pkt_type = PACKET_HOST;
				1057	goto out;
				1058	}
				1059
				1060	vrf_rx_stats(vrf_dev, skb->len);
				1061
				1062	if (!list_empty(&vrf_dev->ptype_all)) {
				1063	skb_push(skb, skb->mac_len);
				1064	dev_queue_xmit_nit(skb, vrf_dev);
				1065	skb_pull(skb, skb->mac_len);
				1066	}
				1067
				1068	skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev);
				1069	out:
				1070	return skb;
				1071	}
				1072
				1073	/* called with rcu lock held */
				1074	static struct sk_buff vrf_l3_rcv(struct net_device vrf_dev,
				1075	struct sk_buff *skb,
				1076	u16 proto)
				1077	{
				1078	switch (proto) {
				1079	case AF_INET:
				1080	return vrf_ip_rcv(vrf_dev, skb);
				1081	case AF_INET6:
				1082	return vrf_ip6_rcv(vrf_dev, skb);
				1083	}
				1084
				1085	return skb;
				1086	}
				1087
				1088	#if IS_ENABLED(CONFIG_IPV6)
				1089	/* send to link-local or multicast address via interface enslaved to
				1090	* VRF device. Force lookup to VRF table without changing flow struct
				1091	*/
				1092	static struct dst_entry vrf_link_scope_lookup(const struct net_device dev,
				1093	struct flowi6 *fl6)
				1094	{
				1095	struct net *net = dev_net(dev);
				1096	int flags = RT6_LOOKUP_F_IFACE;
				1097	struct dst_entry *dst = NULL;
				1098	struct rt6_info *rt;
				1099
				1100	/* VRF device does not have a link-local address and
				1101	* sending packets to link-local or mcast addresses over
				1102	* a VRF device does not make sense
				1103	*/
				1104	if (fl6->flowi6_oif == dev->ifindex) {
				1105	dst = &net->ipv6.ip6_null_entry->dst;
				1106	dst_hold(dst);
				1107	return dst;
				1108	}
				1109
				1110	if (!ipv6_addr_any(&fl6->saddr))
				1111	flags \|= RT6_LOOKUP_F_HAS_SADDR;
				1112
				1113	rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags);
				1114	if (rt)
				1115	dst = &rt->dst;
				1116
				1117	return dst;
				1118	}
				1119	#endif
				1120
				1121	static const struct l3mdev_ops vrf_l3mdev_ops = {
				1122	.l3mdev_fib_table = vrf_fib_table,
				1123	.l3mdev_l3_rcv = vrf_l3_rcv,
				1124	.l3mdev_l3_out = vrf_l3_out,
				1125	#if IS_ENABLED(CONFIG_IPV6)
				1126	.l3mdev_link_scope_lookup = vrf_link_scope_lookup,
				1127	#endif
				1128	};
				1129
				1130	static void vrf_get_drvinfo(struct net_device *dev,
				1131	struct ethtool_drvinfo *info)
				1132	{
				1133	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
				1134	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
				1135	}
				1136
				1137	static const struct ethtool_ops vrf_ethtool_ops = {
				1138	.get_drvinfo = vrf_get_drvinfo,
				1139	};
				1140
				1141	static inline size_t vrf_fib_rule_nl_size(void)
				1142	{
				1143	size_t sz;
				1144
				1145	sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr));
				1146	sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */
				1147	sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */
				1148	sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */
				1149
				1150	return sz;
				1151	}
				1152
				1153	static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it)
				1154	{
				1155	struct fib_rule_hdr *frh;
				1156	struct nlmsghdr *nlh;
				1157	struct sk_buff *skb;
				1158	int err;
				1159
				1160	if (family == AF_INET6 && !ipv6_mod_enabled())
				1161	return 0;
				1162
				1163	skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL);
				1164	if (!skb)
				1165	return -ENOMEM;
				1166
				1167	nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0);
				1168	if (!nlh)
				1169	goto nla_put_failure;
				1170
				1171	/* rule only needs to appear once */
				1172	nlh->nlmsg_flags \|= NLM_F_EXCL;
				1173
				1174	frh = nlmsg_data(nlh);
				1175	memset(frh, 0, sizeof(*frh));
				1176	frh->family = family;
				1177	frh->action = FR_ACT_TO_TBL;
				1178
				1179	if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL))
				1180	goto nla_put_failure;
				1181
				1182	if (nla_put_u8(skb, FRA_L3MDEV, 1))
				1183	goto nla_put_failure;
				1184
				1185	if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF))
				1186	goto nla_put_failure;
				1187
				1188	nlmsg_end(skb, nlh);
				1189
				1190	/* fib_nl_{new,del}rule handling looks for net from skb->sk */
				1191	skb->sk = dev_net(dev)->rtnl;
				1192	if (add_it) {
				1193	err = fib_nl_newrule(skb, nlh, NULL);
				1194	if (err == -EEXIST)
				1195	err = 0;
				1196	} else {
				1197	err = fib_nl_delrule(skb, nlh, NULL);
				1198	if (err == -ENOENT)
				1199	err = 0;
				1200	}
				1201	nlmsg_free(skb);
				1202
				1203	return err;
				1204
				1205	nla_put_failure:
				1206	nlmsg_free(skb);
				1207
				1208	return -EMSGSIZE;
				1209	}
				1210
				1211	static int vrf_add_fib_rules(const struct net_device *dev)
				1212	{
				1213	int err;
				1214
				1215	err = vrf_fib_rule(dev, AF_INET, true);
				1216	if (err < 0)
				1217	goto out_err;
				1218
				1219	err = vrf_fib_rule(dev, AF_INET6, true);
				1220	if (err < 0)
				1221	goto ipv6_err;
				1222
				1223	#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
				1224	err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true);
				1225	if (err < 0)
				1226	goto ipmr_err;
				1227	#endif
				1228
				1229	return 0;
				1230
				1231	#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
				1232	ipmr_err:
				1233	vrf_fib_rule(dev, AF_INET6, false);
				1234	#endif
				1235
				1236	ipv6_err:
				1237	vrf_fib_rule(dev, AF_INET, false);
				1238
				1239	out_err:
				1240	netdev_err(dev, "Failed to add FIB rules.\n");
				1241	return err;
				1242	}
				1243
				1244	static void vrf_setup(struct net_device *dev)
				1245	{
				1246	ether_setup(dev);
				1247
				1248	/* Initialize the device structure. */
				1249	dev->netdev_ops = &vrf_netdev_ops;
				1250	dev->l3mdev_ops = &vrf_l3mdev_ops;
				1251	dev->ethtool_ops = &vrf_ethtool_ops;
				1252	dev->needs_free_netdev = true;
				1253
				1254	/* Fill in device structure with ethernet-generic values. */
				1255	eth_hw_addr_random(dev);
				1256
				1257	/* don't acquire vrf device's netif_tx_lock when transmitting */
				1258	dev->features \|= NETIF_F_LLTX;
				1259
				1260	/* don't allow vrf devices to change network namespaces. */
				1261	dev->features \|= NETIF_F_NETNS_LOCAL;
				1262
				1263	/* does not make sense for a VLAN to be added to a vrf device */
				1264	dev->features \|= NETIF_F_VLAN_CHALLENGED;
				1265
				1266	/* enable offload features */
				1267	dev->features \|= NETIF_F_GSO_SOFTWARE;
				1268	dev->features \|= NETIF_F_RXCSUM \| NETIF_F_HW_CSUM \| NETIF_F_SCTP_CRC;
				1269	dev->features \|= NETIF_F_SG \| NETIF_F_FRAGLIST \| NETIF_F_HIGHDMA;
				1270
				1271	dev->hw_features = dev->features;
				1272	dev->hw_enc_features = dev->features;
				1273
				1274	/* default to no qdisc; user can add if desired */
				1275	dev->priv_flags \|= IFF_NO_QUEUE;
				1276	dev->priv_flags \|= IFF_NO_RX_HANDLER;
				1277	}
				1278
				1279	static int vrf_validate(struct nlattr tb[], struct nlattr data[],
				1280	struct netlink_ext_ack *extack)
				1281	{
				1282	if (tb[IFLA_ADDRESS]) {
				1283	if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
				1284	NL_SET_ERR_MSG(extack, "Invalid hardware address");
				1285	return -EINVAL;
				1286	}
				1287	if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
				1288	NL_SET_ERR_MSG(extack, "Invalid hardware address");
				1289	return -EADDRNOTAVAIL;
				1290	}
				1291	}
				1292	return 0;
				1293	}
				1294
				1295	static void vrf_dellink(struct net_device dev, struct list_head head)
				1296	{
				1297	struct net_device *port_dev;
				1298	struct list_head *iter;
				1299
				1300	netdev_for_each_lower_dev(dev, port_dev, iter)
				1301	vrf_del_slave(dev, port_dev);
				1302
				1303	unregister_netdevice_queue(dev, head);
				1304	}
				1305
				1306	static int vrf_newlink(struct net src_net, struct net_device dev,
				1307	struct nlattr tb[], struct nlattr data[],
				1308	struct netlink_ext_ack *extack)
				1309	{
				1310	struct net_vrf *vrf = netdev_priv(dev);
				1311	bool *add_fib_rules;
				1312	struct net *net;
				1313	int err;
				1314
				1315	if (!data \|\| !data[IFLA_VRF_TABLE]) {
				1316	NL_SET_ERR_MSG(extack, "VRF table id is missing");
				1317	return -EINVAL;
				1318	}
				1319
				1320	vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]);
				1321	if (vrf->tb_id == RT_TABLE_UNSPEC) {
				1322	NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE],
				1323	"Invalid VRF table id");
				1324	return -EINVAL;
				1325	}
				1326
				1327	dev->priv_flags \|= IFF_L3MDEV_MASTER;
				1328
				1329	err = register_netdevice(dev);
				1330	if (err)
				1331	goto out;
				1332
				1333	net = dev_net(dev);
				1334	add_fib_rules = net_generic(net, vrf_net_id);
				1335	if (*add_fib_rules) {
				1336	err = vrf_add_fib_rules(dev);
				1337	if (err) {
				1338	unregister_netdevice(dev);
				1339	goto out;
				1340	}
				1341	*add_fib_rules = false;
				1342	}
				1343
				1344	out:
				1345	return err;
				1346	}
				1347
				1348	static size_t vrf_nl_getsize(const struct net_device *dev)
				1349	{
				1350	return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */
				1351	}
				1352
				1353	static int vrf_fillinfo(struct sk_buff *skb,
				1354	const struct net_device *dev)
				1355	{
				1356	struct net_vrf *vrf = netdev_priv(dev);
				1357
				1358	return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
				1359	}
				1360
				1361	static size_t vrf_get_slave_size(const struct net_device *bond_dev,
				1362	const struct net_device *slave_dev)
				1363	{
				1364	return nla_total_size(sizeof(u32)); /* IFLA_VRF_PORT_TABLE */
				1365	}
				1366
				1367	static int vrf_fill_slave_info(struct sk_buff *skb,
				1368	const struct net_device *vrf_dev,
				1369	const struct net_device *slave_dev)
				1370	{
				1371	struct net_vrf *vrf = netdev_priv(vrf_dev);
				1372
				1373	if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id))
				1374	return -EMSGSIZE;
				1375
				1376	return 0;
				1377	}
				1378
				1379	static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
				1380	[IFLA_VRF_TABLE] = { .type = NLA_U32 },
				1381	};
				1382
				1383	static struct rtnl_link_ops vrf_link_ops __read_mostly = {
				1384	.kind = DRV_NAME,
				1385	.priv_size = sizeof(struct net_vrf),
				1386
				1387	.get_size = vrf_nl_getsize,
				1388	.policy = vrf_nl_policy,
				1389	.validate = vrf_validate,
				1390	.fill_info = vrf_fillinfo,
				1391
				1392	.get_slave_size = vrf_get_slave_size,
				1393	.fill_slave_info = vrf_fill_slave_info,
				1394
				1395	.newlink = vrf_newlink,
				1396	.dellink = vrf_dellink,
				1397	.setup = vrf_setup,
				1398	.maxtype = IFLA_VRF_MAX,
				1399	};
				1400
				1401	static int vrf_device_event(struct notifier_block *unused,
				1402	unsigned long event, void *ptr)
				1403	{
				1404	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
				1405
				1406	/* only care about unregister events to drop slave references */
				1407	if (event == NETDEV_UNREGISTER) {
				1408	struct net_device *vrf_dev;
				1409
				1410	if (!netif_is_l3_slave(dev))
				1411	goto out;
				1412
				1413	vrf_dev = netdev_master_upper_dev_get(dev);
				1414	vrf_del_slave(vrf_dev, dev);
				1415	}
				1416	out:
				1417	return NOTIFY_DONE;
				1418	}
				1419
				1420	static struct notifier_block vrf_notifier_block __read_mostly = {
				1421	.notifier_call = vrf_device_event,
				1422	};
				1423
				1424	/* Initialize per network namespace state */
				1425	static int __net_init vrf_netns_init(struct net *net)
				1426	{
				1427	bool *add_fib_rules = net_generic(net, vrf_net_id);
				1428
				1429	*add_fib_rules = true;
				1430
				1431	return 0;
				1432	}
				1433
				1434	static struct pernet_operations vrf_net_ops __net_initdata = {
				1435	.init = vrf_netns_init,
				1436	.id = &vrf_net_id,
				1437	.size = sizeof(bool),
				1438	};
				1439
				1440	static int __init vrf_init_module(void)
				1441	{
				1442	int rc;
				1443
				1444	register_netdevice_notifier(&vrf_notifier_block);
				1445
				1446	rc = register_pernet_subsys(&vrf_net_ops);
				1447	if (rc < 0)
				1448	goto error;
				1449
				1450	rc = rtnl_link_register(&vrf_link_ops);
				1451	if (rc < 0) {
				1452	unregister_pernet_subsys(&vrf_net_ops);
				1453	goto error;
				1454	}
				1455
				1456	return 0;
				1457
				1458	error:
				1459	unregister_netdevice_notifier(&vrf_notifier_block);
				1460	return rc;
				1461	}
				1462
				1463	module_init(vrf_init_module);
				1464	MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
				1465	MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
				1466	MODULE_LICENSE("GPL");
				1467	MODULE_ALIAS_RTNL_LINK(DRV_NAME);
				1468	MODULE_VERSION(DRV_VERSION);