Blame - src/kernel/linux/v4.14/net/rds/send.c - T103

blob: 23f2d81e7967dacd42b8d4a98165714345b37ba0 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame]	1	/*
				2	* Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
				3	*
				4	* This software is available to you under a choice of one of two
				5	* licenses. You may choose to be licensed under the terms of the GNU
				6	* General Public License (GPL) Version 2, available from the file
				7	* COPYING in the main directory of this source tree, or the
				8	* OpenIB.org BSD license below:
				9	*
				10	* Redistribution and use in source and binary forms, with or
				11	* without modification, are permitted provided that the following
				12	* conditions are met:
				13	*
				14	* - Redistributions of source code must retain the above
				15	* copyright notice, this list of conditions and the following
				16	* disclaimer.
				17	*
				18	* - Redistributions in binary form must reproduce the above
				19	* copyright notice, this list of conditions and the following
				20	* disclaimer in the documentation and/or other materials
				21	* provided with the distribution.
				22	*
				23	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
				24	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
				25	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
				26	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
				27	* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
				28	* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
				29	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				30	* SOFTWARE.
				31	*
				32	*/
				33	#include <linux/kernel.h>
				34	#include <linux/moduleparam.h>
				35	#include <linux/gfp.h>
				36	#include <net/sock.h>
				37	#include <linux/in.h>
				38	#include <linux/list.h>
				39	#include <linux/ratelimit.h>
				40	#include <linux/export.h>
				41	#include <linux/sizes.h>
				42
				43	#include "rds.h"
				44
				45	/* When transmitting messages in rds_send_xmit, we need to emerge from
				46	* time to time and briefly release the CPU. Otherwise the softlockup
				47	* watchdog will kick our shin.
				48	* Also, it seems fairer to not let one busy connection stall all the
				49	* others.
				50	*
				51	* send_batch_count is the number of times we'll loop in send_xmit. Setting
				52	* it to 0 will restore the old behavior (where we looped until we had
				53	* drained the queue).
					*
					* NOTE(review): rds_send_xmit() tests "batch_count >= send_batch_count"
					* right after incrementing batch_count, so a value of 0 looks like it
					* ends every pass before a message is dequeued instead of removing the
					* limit - confirm before relying on the sentence above.
				54	*/
				55	static int send_batch_count = SZ_1K;
				/* Permission 0444: exposed read-only, so the value is fixed at load time. */
				56	module_param(send_batch_count, int, 0444);
				57	MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
				58
				/* Defined further down; declared here because rds_send_xmit() calls it. */
				59	static void rds_send_remove_from_sock(struct list_head *messages, int status);
				60
				61	/*
				62	* Reset the send state. Callers must ensure that this doesn't race with
				63	* rds_send_xmit().
				64	*/
				65	void rds_send_path_reset(struct rds_conn_path *cp)
				66	{
				67	struct rds_message rm, tmp;
				68	unsigned long flags;
				69
				70	if (cp->cp_xmit_rm) {
				71	rm = cp->cp_xmit_rm;
				72	cp->cp_xmit_rm = NULL;
				73	/* Tell the user the RDMA op is no longer mapped by the
				74	* transport. This isn't entirely true (it's flushed out
				75	* independently) but as the connection is down, there's
				76	* no ongoing RDMA to/from that memory */
				77	rds_message_unmapped(rm);
				78	rds_message_put(rm);
				79	}
				80
				81	cp->cp_xmit_sg = 0;
				82	cp->cp_xmit_hdr_off = 0;
				83	cp->cp_xmit_data_off = 0;
				84	cp->cp_xmit_atomic_sent = 0;
				85	cp->cp_xmit_rdma_sent = 0;
				86	cp->cp_xmit_data_sent = 0;
				87
				88	cp->cp_conn->c_map_queued = 0;
				89
				90	cp->cp_unacked_packets = rds_sysctl_max_unacked_packets;
				91	cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes;
				92
				93	/* Mark messages as retransmissions, and move them to the send q */
				94	spin_lock_irqsave(&cp->cp_lock, flags);
				95	list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
				96	set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				97	set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
				98	}
				99	list_splice_init(&cp->cp_retrans, &cp->cp_send_queue);
				100	spin_unlock_irqrestore(&cp->cp_lock, flags);
				101	}
				102	EXPORT_SYMBOL_GPL(rds_send_path_reset);
				103
				104	static int acquire_in_xmit(struct rds_conn_path *cp)
				105	{
				106	return test_and_set_bit(RDS_IN_XMIT, &cp->cp_flags) == 0;
				107	}
				108
				109	static void release_in_xmit(struct rds_conn_path *cp)
				110	{
				111	clear_bit(RDS_IN_XMIT, &cp->cp_flags);
				112	smp_mb__after_atomic();
				113	/*
				114	* We don't use wait_on_bit()/wake_up_bit() because our waking is in a
				115	* hot path and finding waiters is very rare. We don't want to walk
				116	* the system-wide hashed waitqueue buckets in the fast path only to
				117	* almost never find waiters.
				118	*/
				119	if (waitqueue_active(&cp->cp_waitq))
				120	wake_up_all(&cp->cp_waitq);
				121	}
				122
				123	/*
				124	* We're making the conscious trade-off here to only send one message
				125	* down the connection at a time.
				126	* Pro:
				127	* - tx queueing is a simple fifo list
				128	* - reassembly is optional and easily done by transports per conn
				129	* - no per flow rx lookup at all, straight to the socket
				130	* - less per-frag memory and wire overhead
				131	* Con:
				132	* - queued acks can be delayed behind large messages
				133	* Depends:
				134	* - small message latency is higher behind queued large messages
				135	* - large message latency isn't starved by intervening small sends
				136	*/
				137	int rds_send_xmit(struct rds_conn_path *cp)
				138	{
				139	struct rds_connection *conn = cp->cp_conn;
				140	struct rds_message *rm;
				141	unsigned long flags;
				142	unsigned int tmp;
				143	struct scatterlist *sg;
				144	int ret = 0;
				145	LIST_HEAD(to_be_dropped);
				146	int batch_count;
				147	unsigned long send_gen = 0;
				148
				149	restart:
				150	batch_count = 0;
				151
				152	/*
				153	* sendmsg calls here after having queued its message on the send
				154	* queue. We only have one task feeding the connection at a time. If
				155	* another thread is already feeding the queue then we back off. This
				156	* avoids blocking the caller and trading per-connection data between
				157	* caches per message.
				158	*/
				159	if (!acquire_in_xmit(cp)) {
				160	rds_stats_inc(s_send_lock_contention);
				161	ret = -ENOMEM;
				162	goto out;
				163	}
				164
				165	/*
				166	* we record the send generation after doing the xmit acquire.
				167	* if someone else manages to jump in and do some work, we'll use
				168	* this to avoid a goto restart farther down.
				169	*
				170	* The acquire_in_xmit() check above ensures that only one
				171	* caller can increment c_send_gen at any time.
				172	*/
				173	send_gen = READ_ONCE(cp->cp_send_gen) + 1;
				174	WRITE_ONCE(cp->cp_send_gen, send_gen);
				175
				176	/*
				177	* rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
				178	* we do the opposite to avoid races.
				179	*/
				180	if (!rds_conn_path_up(cp)) {
				181	release_in_xmit(cp);
				182	ret = 0;
				183	goto out;
				184	}
				185
				186	if (conn->c_trans->xmit_path_prepare)
				187	conn->c_trans->xmit_path_prepare(cp);
				188
				189	/*
				190	* spin trying to push headers and data down the connection until
				191	* the connection doesn't make forward progress.
				192	*/
				193	while (1) {
				194
				195	rm = cp->cp_xmit_rm;
				196
				197	/*
				198	* If between sending messages, we can send a pending congestion
				199	* map update.
				200	*/
				201	if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
				202	rm = rds_cong_update_alloc(conn);
				203	if (IS_ERR(rm)) {
				204	ret = PTR_ERR(rm);
				205	break;
				206	}
				207	rm->data.op_active = 1;
				208	rm->m_inc.i_conn_path = cp;
				209	rm->m_inc.i_conn = cp->cp_conn;
				210
				211	cp->cp_xmit_rm = rm;
				212	}
				213
				214	/*
				215	* If not already working on one, grab the next message.
				216	*
				217	* cp_xmit_rm holds a ref while we're sending this message down
				218	* the connction. We can use this ref while holding the
				219	* send_sem.. rds_send_reset() is serialized with it.
				220	*/
				221	if (!rm) {
				222	unsigned int len;
				223
				224	batch_count++;
				225
				226	/* we want to process as big a batch as we can, but
				227	* we also want to avoid softlockups. If we've been
				228	* through a lot of messages, lets back off and see
				229	* if anyone else jumps in
				230	*/
				231	if (batch_count >= send_batch_count)
				232	goto over_batch;
				233
				234	spin_lock_irqsave(&cp->cp_lock, flags);
				235
				236	if (!list_empty(&cp->cp_send_queue)) {
				237	rm = list_entry(cp->cp_send_queue.next,
				238	struct rds_message,
				239	m_conn_item);
				240	rds_message_addref(rm);
				241
				242	/*
				243	* Move the message from the send queue to the retransmit
				244	* list right away.
				245	*/
				246	list_move_tail(&rm->m_conn_item,
				247	&cp->cp_retrans);
				248	}
				249
				250	spin_unlock_irqrestore(&cp->cp_lock, flags);
				251
				252	if (!rm)
				253	break;
				254
				255	/* Unfortunately, the way Infiniband deals with
				256	* RDMA to a bad MR key is by moving the entire
				257	* queue pair to error state. We cold possibly
				258	* recover from that, but right now we drop the
				259	* connection.
				260	* Therefore, we never retransmit messages with RDMA ops.
				261	*/
				262	if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) \|\|
				263	(rm->rdma.op_active &&
				264	test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) {
				265	spin_lock_irqsave(&cp->cp_lock, flags);
				266	if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
				267	list_move(&rm->m_conn_item, &to_be_dropped);
				268	spin_unlock_irqrestore(&cp->cp_lock, flags);
				269	continue;
				270	}
				271
				272	/* Require an ACK every once in a while */
				273	len = ntohl(rm->m_inc.i_hdr.h_len);
				274	if (cp->cp_unacked_packets == 0 \|\|
				275	cp->cp_unacked_bytes < len) {
				276	set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				277
				278	cp->cp_unacked_packets =
				279	rds_sysctl_max_unacked_packets;
				280	cp->cp_unacked_bytes =
				281	rds_sysctl_max_unacked_bytes;
				282	rds_stats_inc(s_send_ack_required);
				283	} else {
				284	cp->cp_unacked_bytes -= len;
				285	cp->cp_unacked_packets--;
				286	}
				287
				288	cp->cp_xmit_rm = rm;
				289	}
				290
				291	/* The transport either sends the whole rdma or none of it */
				292	if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) {
				293	rm->m_final_op = &rm->rdma;
				294	/* The transport owns the mapped memory for now.
				295	* You can't unmap it while it's on the send queue
				296	*/
				297	set_bit(RDS_MSG_MAPPED, &rm->m_flags);
				298	ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
				299	if (ret) {
				300	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
				301	wake_up_interruptible(&rm->m_flush_wait);
				302	break;
				303	}
				304	cp->cp_xmit_rdma_sent = 1;
				305
				306	}
				307
				308	if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) {
				309	rm->m_final_op = &rm->atomic;
				310	/* The transport owns the mapped memory for now.
				311	* You can't unmap it while it's on the send queue
				312	*/
				313	set_bit(RDS_MSG_MAPPED, &rm->m_flags);
				314	ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
				315	if (ret) {
				316	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
				317	wake_up_interruptible(&rm->m_flush_wait);
				318	break;
				319	}
				320	cp->cp_xmit_atomic_sent = 1;
				321
				322	}
				323
				324	/*
				325	* A number of cases require an RDS header to be sent
				326	* even if there is no data.
				327	* We permit 0-byte sends; rds-ping depends on this.
				328	* However, if there are exclusively attached silent ops,
				329	* we skip the hdr/data send, to enable silent operation.
				330	*/
				331	if (rm->data.op_nents == 0) {
				332	int ops_present;
				333	int all_ops_are_silent = 1;
				334
				335	ops_present = (rm->atomic.op_active \|\| rm->rdma.op_active);
				336	if (rm->atomic.op_active && !rm->atomic.op_silent)
				337	all_ops_are_silent = 0;
				338	if (rm->rdma.op_active && !rm->rdma.op_silent)
				339	all_ops_are_silent = 0;
				340
				341	if (ops_present && all_ops_are_silent
				342	&& !rm->m_rdma_cookie)
				343	rm->data.op_active = 0;
				344	}
				345
				346	if (rm->data.op_active && !cp->cp_xmit_data_sent) {
				347	rm->m_final_op = &rm->data;
				348
				349	ret = conn->c_trans->xmit(conn, rm,
				350	cp->cp_xmit_hdr_off,
				351	cp->cp_xmit_sg,
				352	cp->cp_xmit_data_off);
				353	if (ret <= 0)
				354	break;
				355
				356	if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) {
				357	tmp = min_t(int, ret,
				358	sizeof(struct rds_header) -
				359	cp->cp_xmit_hdr_off);
				360	cp->cp_xmit_hdr_off += tmp;
				361	ret -= tmp;
				362	}
				363
				364	sg = &rm->data.op_sg[cp->cp_xmit_sg];
				365	while (ret) {
				366	tmp = min_t(int, ret, sg->length -
				367	cp->cp_xmit_data_off);
				368	cp->cp_xmit_data_off += tmp;
				369	ret -= tmp;
				370	if (cp->cp_xmit_data_off == sg->length) {
				371	cp->cp_xmit_data_off = 0;
				372	sg++;
				373	cp->cp_xmit_sg++;
				374	BUG_ON(ret != 0 && cp->cp_xmit_sg ==
				375	rm->data.op_nents);
				376	}
				377	}
				378
				379	if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) &&
				380	(cp->cp_xmit_sg == rm->data.op_nents))
				381	cp->cp_xmit_data_sent = 1;
				382	}
				383
				384	/*
				385	* A rm will only take multiple times through this loop
				386	* if there is a data op. Thus, if the data is sent (or there was
				387	* none), then we're done with the rm.
				388	*/
				389	if (!rm->data.op_active \|\| cp->cp_xmit_data_sent) {
				390	cp->cp_xmit_rm = NULL;
				391	cp->cp_xmit_sg = 0;
				392	cp->cp_xmit_hdr_off = 0;
				393	cp->cp_xmit_data_off = 0;
				394	cp->cp_xmit_rdma_sent = 0;
				395	cp->cp_xmit_atomic_sent = 0;
				396	cp->cp_xmit_data_sent = 0;
				397
				398	rds_message_put(rm);
				399	}
				400	}
				401
				402	over_batch:
				403	if (conn->c_trans->xmit_path_complete)
				404	conn->c_trans->xmit_path_complete(cp);
				405	release_in_xmit(cp);
				406
				407	/* Nuke any messages we decided not to retransmit. */
				408	if (!list_empty(&to_be_dropped)) {
				409	/* irqs on here, so we can put(), unlike above */
				410	list_for_each_entry(rm, &to_be_dropped, m_conn_item)
				411	rds_message_put(rm);
				412	rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
				413	}
				414
				415	/*
				416	* Other senders can queue a message after we last test the send queue
				417	* but before we clear RDS_IN_XMIT. In that case they'd back off and
				418	* not try and send their newly queued message. We need to check the
				419	* send queue after having cleared RDS_IN_XMIT so that their message
				420	* doesn't get stuck on the send queue.
				421	*
				422	* If the transport cannot continue (i.e ret != 0), then it must
				423	* call us when more room is available, such as from the tx
				424	* completion handler.
				425	*
				426	* We have an extra generation check here so that if someone manages
				427	* to jump in after our release_in_xmit, we'll see that they have done
				428	* some work and we will skip our goto
				429	*/
				430	if (ret == 0) {
				431	bool raced;
				432
				433	smp_mb();
				434	raced = send_gen != READ_ONCE(cp->cp_send_gen);
				435
				436	if ((test_bit(0, &conn->c_map_queued) \|\|
				437	!list_empty(&cp->cp_send_queue)) && !raced) {
				438	if (batch_count < send_batch_count)
				439	goto restart;
				440	queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
				441	} else if (raced) {
				442	rds_stats_inc(s_send_lock_queue_raced);
				443	}
				444	}
				445	out:
				446	return ret;
				447	}
				448	EXPORT_SYMBOL_GPL(rds_send_xmit);
				449
				450	static void rds_send_sndbuf_remove(struct rds_sock rs, struct rds_message rm)
				451	{
				452	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				453
				454	assert_spin_locked(&rs->rs_lock);
				455
				456	BUG_ON(rs->rs_snd_bytes < len);
				457	rs->rs_snd_bytes -= len;
				458
				459	if (rs->rs_snd_bytes == 0)
				460	rds_stats_inc(s_send_queue_empty);
				461	}
				462
				463	static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
				464	is_acked_func is_acked)
				465	{
				466	if (is_acked)
				467	return is_acked(rm, ack);
				468	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
				469	}
				470
				471	/*
				472	* This is pretty similar to what happens below in the ACK
				473	* handling code - except that we call here as soon as we get
				474	* the IB send completion on the RDMA op and the accompanying
				475	* message.
				476	*/
				477	void rds_rdma_send_complete(struct rds_message *rm, int status)
				478	{
				479	struct rds_sock *rs = NULL;
				480	struct rm_rdma_op *ro;
				481	struct rds_notifier *notifier;
				482	unsigned long flags;
				483	unsigned int notify = 0;
				484
				485	spin_lock_irqsave(&rm->m_rs_lock, flags);
				486
				487	notify = rm->rdma.op_notify \| rm->data.op_notify;
				488	ro = &rm->rdma;
				489	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
				490	ro->op_active && notify && ro->op_notifier) {
				491	notifier = ro->op_notifier;
				492	rs = rm->m_rs;
				493	sock_hold(rds_rs_to_sk(rs));
				494
				495	notifier->n_status = status;
				496	spin_lock(&rs->rs_lock);
				497	list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
				498	spin_unlock(&rs->rs_lock);
				499
				500	ro->op_notifier = NULL;
				501	}
				502
				503	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
				504
				505	if (rs) {
				506	rds_wake_sk_sleep(rs);
				507	sock_put(rds_rs_to_sk(rs));
				508	}
				509	}
				510	EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
				511
				512	/*
				513	* Just like above, except looks at atomic op
				514	*/
				515	void rds_atomic_send_complete(struct rds_message *rm, int status)
				516	{
				517	struct rds_sock *rs = NULL;
				518	struct rm_atomic_op *ao;
				519	struct rds_notifier *notifier;
				520	unsigned long flags;
				521
				522	spin_lock_irqsave(&rm->m_rs_lock, flags);
				523
				524	ao = &rm->atomic;
				525	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
				526	&& ao->op_active && ao->op_notify && ao->op_notifier) {
				527	notifier = ao->op_notifier;
				528	rs = rm->m_rs;
				529	sock_hold(rds_rs_to_sk(rs));
				530
				531	notifier->n_status = status;
				532	spin_lock(&rs->rs_lock);
				533	list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
				534	spin_unlock(&rs->rs_lock);
				535
				536	ao->op_notifier = NULL;
				537	}
				538
				539	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
				540
				541	if (rs) {
				542	rds_wake_sk_sleep(rs);
				543	sock_put(rds_rs_to_sk(rs));
				544	}
				545	}
				546	EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
				547
				548	/*
				549	* This is the same as rds_rdma_send_complete except we
				550	* don't do any locking - we have all the ingredients (message,
				551	* socket, socket lock) and can just move the notifier.
				552	*/
				553	static inline void
				554	__rds_send_complete(struct rds_sock rs, struct rds_message rm, int status)
				555	{
				556	struct rm_rdma_op *ro;
				557	struct rm_atomic_op *ao;
				558
				559	ro = &rm->rdma;
				560	if (ro->op_active && ro->op_notify && ro->op_notifier) {
				561	ro->op_notifier->n_status = status;
				562	list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
				563	ro->op_notifier = NULL;
				564	}
				565
				566	ao = &rm->atomic;
				567	if (ao->op_active && ao->op_notify && ao->op_notifier) {
				568	ao->op_notifier->n_status = status;
				569	list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
				570	ao->op_notifier = NULL;
				571	}
				572
				573	/* No need to wake the app - caller does this */
				574	}
				575
				576	/*
				577	* This removes messages from the socket's list if they're on it. The list
				578	* argument must be private to the caller, we must be able to modify it
				579	* without locks. The messages must have a reference held for their
				580	* position on the list. This function will drop that reference after
				581	* removing the messages from the 'messages' list regardless of if it found
				582	* the messages on the socket list or not.
				583	*/
				584	static void rds_send_remove_from_sock(struct list_head *messages, int status)
				585	{
				586	unsigned long flags;
				587	struct rds_sock *rs = NULL;
				588	struct rds_message *rm;
				589
				590	while (!list_empty(messages)) {
				591	int was_on_sock = 0;
				592
				593	rm = list_entry(messages->next, struct rds_message,
				594	m_conn_item);
				595	list_del_init(&rm->m_conn_item);
				596
				597	/*
				598	* If we see this flag cleared then we're sure that someone
				599	* else beat us to removing it from the sock. If we race
				600	* with their flag update we'll get the lock and then really
				601	* see that the flag has been cleared.
				602	*
				603	* The message spinlock makes sure nobody clears rm->m_rs
				604	* while we're messing with it. It does not prevent the
				605	* message from being removed from the socket, though.
				606	*/
				607	spin_lock_irqsave(&rm->m_rs_lock, flags);
				608	if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
				609	goto unlock_and_drop;
				610
				611	if (rs != rm->m_rs) {
				612	if (rs) {
				613	rds_wake_sk_sleep(rs);
				614	sock_put(rds_rs_to_sk(rs));
				615	}
				616	rs = rm->m_rs;
				617	if (rs)
				618	sock_hold(rds_rs_to_sk(rs));
				619	}
				620	if (!rs)
				621	goto unlock_and_drop;
				622	spin_lock(&rs->rs_lock);
				623
				624	if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
				625	struct rm_rdma_op *ro = &rm->rdma;
				626	struct rds_notifier *notifier;
				627
				628	list_del_init(&rm->m_sock_item);
				629	rds_send_sndbuf_remove(rs, rm);
				630
				631	if (ro->op_active && ro->op_notifier &&
				632	(ro->op_notify \|\| (ro->op_recverr && status))) {
				633	notifier = ro->op_notifier;
				634	list_add_tail(&notifier->n_list,
				635	&rs->rs_notify_queue);
				636	if (!notifier->n_status)
				637	notifier->n_status = status;
				638	rm->rdma.op_notifier = NULL;
				639	}
				640	was_on_sock = 1;
				641	rm->m_rs = NULL;
				642	}
				643	spin_unlock(&rs->rs_lock);
				644
				645	unlock_and_drop:
				646	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
				647	rds_message_put(rm);
				648	if (was_on_sock)
				649	rds_message_put(rm);
				650	}
				651
				652	if (rs) {
				653	rds_wake_sk_sleep(rs);
				654	sock_put(rds_rs_to_sk(rs));
				655	}
				656	}
				657
				658	/*
				659	* Transports call here when they've determined that the receiver queued
				660	* messages up to, and including, the given sequence number. Messages are
				661	* moved to the retrans queue when rds_send_xmit picks them off the send
				662	* queue. This means that in the TCP case, the message may not have been
				663	* assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
				664	* checks the RDS_MSG_HAS_ACK_SEQ bit.
				665	*/
				666	void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
				667	is_acked_func is_acked)
				668	{
				669	struct rds_message rm, tmp;
				670	unsigned long flags;
				671	LIST_HEAD(list);
				672
				673	spin_lock_irqsave(&cp->cp_lock, flags);
				674
				675	list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
				676	if (!rds_send_is_acked(rm, ack, is_acked))
				677	break;
				678
				679	list_move(&rm->m_conn_item, &list);
				680	clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				681	}
				682
				683	/* order flag updates with spin locks */
				684	if (!list_empty(&list))
				685	smp_mb__after_atomic();
				686
				687	spin_unlock_irqrestore(&cp->cp_lock, flags);
				688
				689	/* now remove the messages from the sock list as needed */
				690	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
				691	}
				692	EXPORT_SYMBOL_GPL(rds_send_path_drop_acked);
				693
				694	void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
				695	is_acked_func is_acked)
				696	{
				697	WARN_ON(conn->c_trans->t_mp_capable);
				698	rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked);
				699	}
				700	EXPORT_SYMBOL_GPL(rds_send_drop_acked);
				701
				702	void rds_send_drop_to(struct rds_sock rs, struct sockaddr_in dest)
				703	{
				704	struct rds_message rm, tmp;
				705	struct rds_connection *conn;
				706	struct rds_conn_path *cp;
				707	unsigned long flags;
				708	LIST_HEAD(list);
				709
				710	/* get all the messages we're dropping under the rs lock */
				711	spin_lock_irqsave(&rs->rs_lock, flags);
				712
				713	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
				714	if (dest && (dest->sin_addr.s_addr != rm->m_daddr \|\|
				715	dest->sin_port != rm->m_inc.i_hdr.h_dport))
				716	continue;
				717
				718	list_move(&rm->m_sock_item, &list);
				719	rds_send_sndbuf_remove(rs, rm);
				720	clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
				721	}
				722
				723	/* order flag updates with the rs lock */
				724	smp_mb__after_atomic();
				725
				726	spin_unlock_irqrestore(&rs->rs_lock, flags);
				727
				728	if (list_empty(&list))
				729	return;
				730
				731	/* Remove the messages from the conn */
				732	list_for_each_entry(rm, &list, m_sock_item) {
				733
				734	conn = rm->m_inc.i_conn;
				735	if (conn->c_trans->t_mp_capable)
				736	cp = rm->m_inc.i_conn_path;
				737	else
				738	cp = &conn->c_path[0];
				739
				740	spin_lock_irqsave(&cp->cp_lock, flags);
				741	/*
				742	* Maybe someone else beat us to removing rm from the conn.
				743	* If we race with their flag update we'll get the lock and
				744	* then really see that the flag has been cleared.
				745	*/
				746	if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
				747	spin_unlock_irqrestore(&cp->cp_lock, flags);
				748	spin_lock_irqsave(&rm->m_rs_lock, flags);
				749	rm->m_rs = NULL;
				750	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
				751	continue;
				752	}
				753	list_del_init(&rm->m_conn_item);
				754	spin_unlock_irqrestore(&cp->cp_lock, flags);
				755
				756	/*
				757	* Couldn't grab m_rs_lock in top loop (lock ordering),
				758	* but we can now.
				759	*/
				760	spin_lock_irqsave(&rm->m_rs_lock, flags);
				761
				762	spin_lock(&rs->rs_lock);
				763	__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
				764	spin_unlock(&rs->rs_lock);
				765
				766	rm->m_rs = NULL;
				767	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
				768
				769	rds_message_put(rm);
				770	}
				771
				772	rds_wake_sk_sleep(rs);
				773
				774	while (!list_empty(&list)) {
				775	rm = list_entry(list.next, struct rds_message, m_sock_item);
				776	list_del_init(&rm->m_sock_item);
				777	rds_message_wait(rm);
				778
				779	/* just in case the code above skipped this message
				780	* because RDS_MSG_ON_CONN wasn't set, run it again here
				781	* taking m_rs_lock is the only thing that keeps us
				782	* from racing with ack processing.
				783	*/
				784	spin_lock_irqsave(&rm->m_rs_lock, flags);
				785
				786	spin_lock(&rs->rs_lock);
				787	__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
				788	spin_unlock(&rs->rs_lock);
				789
				790	rm->m_rs = NULL;
				791	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
				792
				793	rds_message_put(rm);
				794	}
				795	}
				796
				797	/*
				798	* we only want this to fire once so we use the callers 'queued'. It's
				799	* possible that another thread can race with us and remove the
				800	* message from the flow with RDS_CANCEL_SENT_TO.
				801	*/
				802	static int rds_send_queue_rm(struct rds_sock rs, struct rds_connection conn,
				803	struct rds_conn_path *cp,
				804	struct rds_message *rm, __be16 sport,
				805	__be16 dport, int *queued)
				806	{
				807	unsigned long flags;
				808	u32 len;
				809
				810	if (*queued)
				811	goto out;
				812
				813	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				814
				815	/* this is the only place which holds both the socket's rs_lock
				816	* and the connection's c_lock */
				817	spin_lock_irqsave(&rs->rs_lock, flags);
				818
				819	/*
				820	* If there is a little space in sndbuf, we don't queue anything,
				821	* and userspace gets -EAGAIN. But poll() indicates there's send
				822	* room. This can lead to bad behavior (spinning) if snd_bytes isn't
				823	* freed up by incoming acks. So we check the old value of
				824	* rs_snd_bytes here to allow the last msg to exceed the buffer,
				825	* and poll() now knows no more data can be sent.
				826	*/
				827	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
				828	rs->rs_snd_bytes += len;
				829
				830	/* let recv side know we are close to send space exhaustion.
				831	* This is probably not the optimal way to do it, as this
				832	* means we set the flag on all messages as soon as our
				833	* throughput hits a certain threshold.
				834	*/
				835	if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
				836	set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				837
				838	list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
				839	set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
				840	rds_message_addref(rm);
				841	rm->m_rs = rs;
				842
				843	/* The code ordering is a little weird, but we're
				844	trying to minimize the time we hold c_lock */
				845	rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
				846	rm->m_inc.i_conn = conn;
				847	rm->m_inc.i_conn_path = cp;
				848	rds_message_addref(rm);
				849
				850	spin_lock(&cp->cp_lock);
				851	rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++);
				852	list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
				853	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				854	spin_unlock(&cp->cp_lock);
				855
				856	rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
				857	rm, len, rs, rs->rs_snd_bytes,
				858	(unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
				859
				860	*queued = 1;
				861	}
				862
				863	spin_unlock_irqrestore(&rs->rs_lock, flags);
				864	out:
				865	return *queued;
				866	}
				867
				868	/*
				869	* rds_message is getting to be quite complicated, and we'd like to allocate
				870	* it all in one go. This figures out how big it needs to be up front.
				871	*/
				872	static int rds_rm_size(struct msghdr *msg, int data_len)
				873	{
				874	struct cmsghdr *cmsg;
				875	int size = 0;
				876	int cmsg_groups = 0;
				877	int retval;
				878
				879	for_each_cmsghdr(cmsg, msg) {
				880	if (!CMSG_OK(msg, cmsg))
				881	return -EINVAL;
				882
				883	if (cmsg->cmsg_level != SOL_RDS)
				884	continue;
				885
				886	switch (cmsg->cmsg_type) {
				887	case RDS_CMSG_RDMA_ARGS:
				888	cmsg_groups \|= 1;
				889	retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
				890	if (retval < 0)
				891	return retval;
				892	size += retval;
				893
				894	break;
				895
				896	case RDS_CMSG_RDMA_DEST:
				897	case RDS_CMSG_RDMA_MAP:
				898	cmsg_groups \|= 2;
				899	/* these are valid but do no add any size */
				900	break;
				901
				902	case RDS_CMSG_ATOMIC_CSWP:
				903	case RDS_CMSG_ATOMIC_FADD:
				904	case RDS_CMSG_MASKED_ATOMIC_CSWP:
				905	case RDS_CMSG_MASKED_ATOMIC_FADD:
				906	cmsg_groups \|= 1;
				907	size += sizeof(struct scatterlist);
				908	break;
				909
				910	default:
				911	return -EINVAL;
				912	}
				913
				914	}
				915
				916	size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
				917
				918	/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
				919	if (cmsg_groups == 3)
				920	return -EINVAL;
				921
				922	return size;
				923	}
				924
				925	static int rds_cmsg_send(struct rds_sock rs, struct rds_message rm,
				926	struct msghdr msg, int allocated_mr)
				927	{
				928	struct cmsghdr *cmsg;
				929	int ret = 0;
				930
				931	for_each_cmsghdr(cmsg, msg) {
				932	if (!CMSG_OK(msg, cmsg))
				933	return -EINVAL;
				934
				935	if (cmsg->cmsg_level != SOL_RDS)
				936	continue;
				937
				938	/* As a side effect, RDMA_DEST and RDMA_MAP will set
				939	* rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
				940	*/
				941	switch (cmsg->cmsg_type) {
				942	case RDS_CMSG_RDMA_ARGS:
				943	ret = rds_cmsg_rdma_args(rs, rm, cmsg);
				944	break;
				945
				946	case RDS_CMSG_RDMA_DEST:
				947	ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
				948	break;
				949
				950	case RDS_CMSG_RDMA_MAP:
				951	ret = rds_cmsg_rdma_map(rs, rm, cmsg);
				952	if (!ret)
				953	*allocated_mr = 1;
				954	else if (ret == -ENODEV)
				955	/* Accommodate the get_mr() case which can fail
				956	* if connection isn't established yet.
				957	*/
				958	ret = -EAGAIN;
				959	break;
				960	case RDS_CMSG_ATOMIC_CSWP:
				961	case RDS_CMSG_ATOMIC_FADD:
				962	case RDS_CMSG_MASKED_ATOMIC_CSWP:
				963	case RDS_CMSG_MASKED_ATOMIC_FADD:
				964	ret = rds_cmsg_atomic(rs, rm, cmsg);
				965	break;
				966
				967	default:
				968	return -EINVAL;
				969	}
				970
				971	if (ret)
				972	break;
				973	}
				974
				975	return ret;
				976	}
				977
				978	static int rds_send_mprds_hash(struct rds_sock rs, struct rds_connection conn)
				979	{
				980	int hash;
				981
				982	if (conn->c_npaths == 0)
				983	hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS);
				984	else
				985	hash = RDS_MPATH_HASH(rs, conn->c_npaths);
				986	if (conn->c_npaths == 0 && hash != 0) {
				987	rds_send_ping(conn, 0);
				988
				989	/* The underlying connection is not up yet. Need to wait
				990	* until it is up to be sure that the non-zero c_path can be
				991	* used. But if we are interrupted, we have to use the zero
				992	* c_path in case the connection ends up being non-MP capable.
				993	*/
				994	if (conn->c_npaths == 0)
				995	if (wait_event_interruptible(conn->c_hs_waitq,
				996	conn->c_npaths != 0))
				997	hash = 0;
				998	if (conn->c_npaths == 1)
				999	hash = 0;
				1000	}
				1001	return hash;
				1002	}
				1003
				1004	static int rds_rdma_bytes(struct msghdr msg, size_t rdma_bytes)
				1005	{
				1006	struct rds_rdma_args *args;
				1007	struct cmsghdr *cmsg;
				1008
				1009	for_each_cmsghdr(cmsg, msg) {
				1010	if (!CMSG_OK(msg, cmsg))
				1011	return -EINVAL;
				1012
				1013	if (cmsg->cmsg_level != SOL_RDS)
				1014	continue;
				1015
				1016	if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
				1017	if (cmsg->cmsg_len <
				1018	CMSG_LEN(sizeof(struct rds_rdma_args)))
				1019	return -EINVAL;
				1020	args = CMSG_DATA(cmsg);
				1021	*rdma_bytes += args->remote_vec.bytes;
				1022	}
				1023	}
				1024	return 0;
				1025	}
				1026
				1027	int rds_sendmsg(struct socket sock, struct msghdr msg, size_t payload_len)
				1028	{
				1029	struct sock *sk = sock->sk;
				1030	struct rds_sock *rs = rds_sk_to_rs(sk);
				1031	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
				1032	__be32 daddr;
				1033	__be16 dport;
				1034	struct rds_message *rm = NULL;
				1035	struct rds_connection *conn;
				1036	int ret = 0;
				1037	int queued = 0, allocated_mr = 0;
				1038	int nonblock = msg->msg_flags & MSG_DONTWAIT;
				1039	long timeo = sock_sndtimeo(sk, nonblock);
				1040	struct rds_conn_path *cpath;
				1041	size_t total_payload_len = payload_len, rdma_payload_len = 0;
				1042
				1043	/* Mirror Linux UDP mirror of BSD error message compatibility */
				1044	/* XXX: Perhaps MSG_MORE someday */
				1045	if (msg->msg_flags & ~(MSG_DONTWAIT \| MSG_CMSG_COMPAT)) {
				1046	ret = -EOPNOTSUPP;
				1047	goto out;
				1048	}
				1049
				1050	if (msg->msg_namelen) {
				1051	/* XXX fail non-unicast destination IPs? */
				1052	if (msg->msg_namelen < sizeof(*usin) \|\| usin->sin_family != AF_INET) {
				1053	ret = -EINVAL;
				1054	goto out;
				1055	}
				1056	daddr = usin->sin_addr.s_addr;
				1057	dport = usin->sin_port;
				1058	} else {
				1059	/* We only care about consistency with ->connect() */
				1060	lock_sock(sk);
				1061	daddr = rs->rs_conn_addr;
				1062	dport = rs->rs_conn_port;
				1063	release_sock(sk);
				1064	}
				1065
				1066	lock_sock(sk);
				1067	if (daddr == 0 \|\| rs->rs_bound_addr == 0) {
				1068	release_sock(sk);
				1069	ret = -ENOTCONN; /* XXX not a great errno */
				1070	goto out;
				1071	}
				1072	release_sock(sk);
				1073
				1074	ret = rds_rdma_bytes(msg, &rdma_payload_len);
				1075	if (ret)
				1076	goto out;
				1077
				1078	total_payload_len += rdma_payload_len;
				1079	if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
				1080	ret = -EMSGSIZE;
				1081	goto out;
				1082	}
				1083
				1084	if (payload_len > rds_sk_sndbuf(rs)) {
				1085	ret = -EMSGSIZE;
				1086	goto out;
				1087	}
				1088
				1089	/* size of rm including all sgs */
				1090	ret = rds_rm_size(msg, payload_len);
				1091	if (ret < 0)
				1092	goto out;
				1093
				1094	rm = rds_message_alloc(ret, GFP_KERNEL);
				1095	if (!rm) {
				1096	ret = -ENOMEM;
				1097	goto out;
				1098	}
				1099
				1100	/* Attach data to the rm */
				1101	if (payload_len) {
				1102	rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
				1103	if (!rm->data.op_sg) {
				1104	ret = -ENOMEM;
				1105	goto out;
				1106	}
				1107	ret = rds_message_copy_from_user(rm, &msg->msg_iter);
				1108	if (ret)
				1109	goto out;
				1110	}
				1111	rm->data.op_active = 1;
				1112
				1113	rm->m_daddr = daddr;
				1114
				1115	/* rds_conn_create has a spinlock that runs with IRQ off.
				1116	* Caching the conn in the socket helps a lot. */
				1117	if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
				1118	conn = rs->rs_conn;
				1119	else {
				1120	conn = rds_conn_create_outgoing(sock_net(sock->sk),
				1121	rs->rs_bound_addr, daddr,
				1122	rs->rs_transport,
				1123	sock->sk->sk_allocation);
				1124	if (IS_ERR(conn)) {
				1125	ret = PTR_ERR(conn);
				1126	goto out;
				1127	}
				1128	rs->rs_conn = conn;
				1129	}
				1130
				1131	/* Parse any control messages the user may have included. */
				1132	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
				1133	if (ret) {
				1134	/* Trigger connection so that its ready for the next retry */
				1135	if (ret == -EAGAIN)
				1136	rds_conn_connect_if_down(conn);
				1137	goto out;
				1138	}
				1139
				1140	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
				1141	printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
				1142	&rm->rdma, conn->c_trans->xmit_rdma);
				1143	ret = -EOPNOTSUPP;
				1144	goto out;
				1145	}
				1146
				1147	if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
				1148	printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
				1149	&rm->atomic, conn->c_trans->xmit_atomic);
				1150	ret = -EOPNOTSUPP;
				1151	goto out;
				1152	}
				1153
				1154	if (conn->c_trans->t_mp_capable)
				1155	cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)];
				1156	else
				1157	cpath = &conn->c_path[0];
				1158
				1159	rds_conn_path_connect_if_down(cpath);
				1160
				1161	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
				1162	if (ret) {
				1163	rs->rs_seen_congestion = 1;
				1164	goto out;
				1165	}
				1166	while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port,
				1167	dport, &queued)) {
				1168	rds_stats_inc(s_send_queue_full);
				1169
				1170	if (nonblock) {
				1171	ret = -EAGAIN;
				1172	goto out;
				1173	}
				1174
				1175	timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
				1176	rds_send_queue_rm(rs, conn, cpath, rm,
				1177	rs->rs_bound_port,
				1178	dport,
				1179	&queued),
				1180	timeo);
				1181	rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
				1182	if (timeo > 0 \|\| timeo == MAX_SCHEDULE_TIMEOUT)
				1183	continue;
				1184
				1185	ret = timeo;
				1186	if (ret == 0)
				1187	ret = -ETIMEDOUT;
				1188	goto out;
				1189	}
				1190
				1191	/*
				1192	* By now we've committed to the send. We reuse rds_send_worker()
				1193	* to retry sends in the rds thread if the transport asks us to.
				1194	*/
				1195	rds_stats_inc(s_send_queued);
				1196
				1197	ret = rds_send_xmit(cpath);
				1198	if (ret == -ENOMEM \|\| ret == -EAGAIN)
				1199	queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
				1200
				1201	rds_message_put(rm);
				1202	return payload_len;
				1203
				1204	out:
				1205	/* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
				1206	* If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
				1207	* or in any other way, we need to destroy the MR again */
				1208	if (allocated_mr)
				1209	rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
				1210
				1211	if (rm)
				1212	rds_message_put(rm);
				1213	return ret;
				1214	}
				1215
				1216	/*
				1217	* send out a probe. Can be shared by rds_send_ping,
				1218	* rds_send_pong, rds_send_hb.
				1219	* rds_send_hb should use h_flags
				1220	* RDS_FLAG_HB_PING\|RDS_FLAG_ACK_REQUIRED
				1221	* or
				1222	* RDS_FLAG_HB_PONG\|RDS_FLAG_ACK_REQUIRED
				1223	*/
				1224	static int
				1225	rds_send_probe(struct rds_conn_path *cp, __be16 sport,
				1226	__be16 dport, u8 h_flags)
				1227	{
				1228	struct rds_message *rm;
				1229	unsigned long flags;
				1230	int ret = 0;
				1231
				1232	rm = rds_message_alloc(0, GFP_ATOMIC);
				1233	if (!rm) {
				1234	ret = -ENOMEM;
				1235	goto out;
				1236	}
				1237
				1238	rm->m_daddr = cp->cp_conn->c_faddr;
				1239	rm->data.op_active = 1;
				1240
				1241	rds_conn_path_connect_if_down(cp);
				1242
				1243	ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL);
				1244	if (ret)
				1245	goto out;
				1246
				1247	spin_lock_irqsave(&cp->cp_lock, flags);
				1248	list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
				1249	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				1250	rds_message_addref(rm);
				1251	rm->m_inc.i_conn = cp->cp_conn;
				1252	rm->m_inc.i_conn_path = cp;
				1253
				1254	rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport,
				1255	cp->cp_next_tx_seq);
				1256	rm->m_inc.i_hdr.h_flags \|= h_flags;
				1257	cp->cp_next_tx_seq++;
				1258
				1259	if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) &&
				1260	cp->cp_conn->c_trans->t_mp_capable) {
				1261	u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
				1262	u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
				1263
				1264	rds_message_add_extension(&rm->m_inc.i_hdr,
				1265	RDS_EXTHDR_NPATHS, &npaths,
				1266	sizeof(npaths));
				1267	rds_message_add_extension(&rm->m_inc.i_hdr,
				1268	RDS_EXTHDR_GEN_NUM,
				1269	&my_gen_num,
				1270	sizeof(u32));
				1271	}
				1272	spin_unlock_irqrestore(&cp->cp_lock, flags);
				1273
				1274	rds_stats_inc(s_send_queued);
				1275	rds_stats_inc(s_send_pong);
				1276
				1277	/* schedule the send work on rds_wq */
				1278	queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
				1279
				1280	rds_message_put(rm);
				1281	return 0;
				1282
				1283	out:
				1284	if (rm)
				1285	rds_message_put(rm);
				1286	return ret;
				1287	}
				1288
				1289	int
				1290	rds_send_pong(struct rds_conn_path *cp, __be16 dport)
				1291	{
				1292	return rds_send_probe(cp, 0, dport, 0);
				1293	}
				1294
				1295	void
				1296	rds_send_ping(struct rds_connection *conn, int cp_index)
				1297	{
				1298	unsigned long flags;
				1299	struct rds_conn_path *cp = &conn->c_path[cp_index];
				1300
				1301	spin_lock_irqsave(&cp->cp_lock, flags);
				1302	if (conn->c_ping_triggered) {
				1303	spin_unlock_irqrestore(&cp->cp_lock, flags);
				1304	return;
				1305	}
				1306	conn->c_ping_triggered = 1;
				1307	spin_unlock_irqrestore(&cp->cp_lock, flags);
				1308	rds_send_probe(cp, cpu_to_be16(RDS_FLAG_PROBE_PORT), 0, 0);
				1309	}
				1310	EXPORT_SYMBOL_GPL(rds_send_ping);