Blame - src/kernel/linux/v4.19/net/rds/send.c - T800

blob: 26e2c2305f7aaf4be9c49e41e4a4c932d8141843 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame^]	1	/*
				2	* Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
				3	*
				4	* This software is available to you under a choice of one of two
				5	* licenses. You may choose to be licensed under the terms of the GNU
				6	* General Public License (GPL) Version 2, available from the file
				7	* COPYING in the main directory of this source tree, or the
				8	* OpenIB.org BSD license below:
				9	*
				10	* Redistribution and use in source and binary forms, with or
				11	* without modification, are permitted provided that the following
				12	* conditions are met:
				13	*
				14	* - Redistributions of source code must retain the above
				15	* copyright notice, this list of conditions and the following
				16	* disclaimer.
				17	*
				18	* - Redistributions in binary form must reproduce the above
				19	* copyright notice, this list of conditions and the following
				20	* disclaimer in the documentation and/or other materials
				21	* provided with the distribution.
				22	*
				23	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
				24	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
				25	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
				26	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
				27	* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
				28	* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
				29	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				30	* SOFTWARE.
				31	*
				32	*/
				33	#include <linux/kernel.h>
				34	#include <linux/moduleparam.h>
				35	#include <linux/gfp.h>
				36	#include <net/sock.h>
				37	#include <linux/in.h>
				38	#include <linux/list.h>
				39	#include <linux/ratelimit.h>
				40	#include <linux/export.h>
				41	#include <linux/sizes.h>
				42
				43	#include "rds.h"
				44
				45	/* When transmitting messages in rds_send_xmit, we need to emerge from
				46	* time to time and briefly release the CPU. Otherwise the softlock watchdog
				47	* will kick our shin.
				48	* Also, it seems fairer to not let one busy connection stall all the
				49	* others.
				50	*
				51	* send_batch_count is the number of times we'll loop in send_xmit. Setting
				52	* it to 0 will restore the old behavior (where we looped until we had
				53	* drained the queue).
				54	*/
				55	static int send_batch_count = SZ_1K;
				56	module_param(send_batch_count, int, 0444);
				57	MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
				58
				59	static void rds_send_remove_from_sock(struct list_head *messages, int status);
				60
				61	/*
				62	* Reset the send state. Callers must ensure that this doesn't race with
				63	* rds_send_xmit().
				64	*/
				65	void rds_send_path_reset(struct rds_conn_path *cp)
				66	{
				67	struct rds_message rm, tmp;
				68	unsigned long flags;
				69
				70	if (cp->cp_xmit_rm) {
				71	rm = cp->cp_xmit_rm;
				72	cp->cp_xmit_rm = NULL;
				73	/* Tell the user the RDMA op is no longer mapped by the
				74	* transport. This isn't entirely true (it's flushed out
				75	* independently) but as the connection is down, there's
				76	* no ongoing RDMA to/from that memory */
				77	rds_message_unmapped(rm);
				78	rds_message_put(rm);
				79	}
				80
				81	cp->cp_xmit_sg = 0;
				82	cp->cp_xmit_hdr_off = 0;
				83	cp->cp_xmit_data_off = 0;
				84	cp->cp_xmit_atomic_sent = 0;
				85	cp->cp_xmit_rdma_sent = 0;
				86	cp->cp_xmit_data_sent = 0;
				87
				88	cp->cp_conn->c_map_queued = 0;
				89
				90	cp->cp_unacked_packets = rds_sysctl_max_unacked_packets;
				91	cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes;
				92
				93	/* Mark messages as retransmissions, and move them to the send q */
				94	spin_lock_irqsave(&cp->cp_lock, flags);
				95	list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
				96	set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				97	set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
				98	}
				99	list_splice_init(&cp->cp_retrans, &cp->cp_send_queue);
				100	spin_unlock_irqrestore(&cp->cp_lock, flags);
				101	}
				102	EXPORT_SYMBOL_GPL(rds_send_path_reset);
				103
				104	static int acquire_in_xmit(struct rds_conn_path *cp)
				105	{
				106	return test_and_set_bit(RDS_IN_XMIT, &cp->cp_flags) == 0;
				107	}
				108
				109	static void release_in_xmit(struct rds_conn_path *cp)
				110	{
				111	clear_bit(RDS_IN_XMIT, &cp->cp_flags);
				112	smp_mb__after_atomic();
				113	/*
				114	* We don't use wait_on_bit()/wake_up_bit() because our waking is in a
				115	* hot path and finding waiters is very rare. We don't want to walk
				116	* the system-wide hashed waitqueue buckets in the fast path only to
				117	* almost never find waiters.
				118	*/
				119	if (waitqueue_active(&cp->cp_waitq))
				120	wake_up_all(&cp->cp_waitq);
				121	}
				122
				123	/*
				124	* We're making the conscious trade-off here to only send one message
				125	* down the connection at a time.
				126	* Pro:
				127	* - tx queueing is a simple fifo list
				128	* - reassembly is optional and easily done by transports per conn
				129	* - no per flow rx lookup at all, straight to the socket
				130	* - less per-frag memory and wire overhead
				131	* Con:
				132	* - queued acks can be delayed behind large messages
				133	* Depends:
				134	* - small message latency is higher behind queued large messages
				135	* - large message latency isn't starved by intervening small sends
				136	*/
				137	int rds_send_xmit(struct rds_conn_path *cp)
				138	{
				139	struct rds_connection *conn = cp->cp_conn;
				140	struct rds_message *rm;
				141	unsigned long flags;
				142	unsigned int tmp;
				143	struct scatterlist *sg;
				144	int ret = 0;
				145	LIST_HEAD(to_be_dropped);
				146	int batch_count;
				147	unsigned long send_gen = 0;
				148
				149	restart:
				150	batch_count = 0;
				151
				152	/*
				153	* sendmsg calls here after having queued its message on the send
				154	* queue. We only have one task feeding the connection at a time. If
				155	* another thread is already feeding the queue then we back off. This
				156	* avoids blocking the caller and trading per-connection data between
				157	* caches per message.
				158	*/
				159	if (!acquire_in_xmit(cp)) {
				160	rds_stats_inc(s_send_lock_contention);
				161	ret = -ENOMEM;
				162	goto out;
				163	}
				164
				165	if (rds_destroy_pending(cp->cp_conn)) {
				166	release_in_xmit(cp);
				167	ret = -ENETUNREACH; /* dont requeue send work */
				168	goto out;
				169	}
				170
				171	/*
				172	* we record the send generation after doing the xmit acquire.
				173	* if someone else manages to jump in and do some work, we'll use
				174	* this to avoid a goto restart farther down.
				175	*
				176	* The acquire_in_xmit() check above ensures that only one
				177	* caller can increment c_send_gen at any time.
				178	*/
				179	send_gen = READ_ONCE(cp->cp_send_gen) + 1;
				180	WRITE_ONCE(cp->cp_send_gen, send_gen);
				181
				182	/*
				183	* rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
				184	* we do the opposite to avoid races.
				185	*/
				186	if (!rds_conn_path_up(cp)) {
				187	release_in_xmit(cp);
				188	ret = 0;
				189	goto out;
				190	}
				191
				192	if (conn->c_trans->xmit_path_prepare)
				193	conn->c_trans->xmit_path_prepare(cp);
				194
				195	/*
				196	* spin trying to push headers and data down the connection until
				197	* the connection doesn't make forward progress.
				198	*/
				199	while (1) {
				200
				201	rm = cp->cp_xmit_rm;
				202
				203	/*
				204	* If between sending messages, we can send a pending congestion
				205	* map update.
				206	*/
				207	if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
				208	rm = rds_cong_update_alloc(conn);
				209	if (IS_ERR(rm)) {
				210	ret = PTR_ERR(rm);
				211	break;
				212	}
				213	rm->data.op_active = 1;
				214	rm->m_inc.i_conn_path = cp;
				215	rm->m_inc.i_conn = cp->cp_conn;
				216
				217	cp->cp_xmit_rm = rm;
				218	}
				219
				220	/*
				221	* If not already working on one, grab the next message.
				222	*
				223	* cp_xmit_rm holds a ref while we're sending this message down
				224	* the connction. We can use this ref while holding the
				225	* send_sem.. rds_send_reset() is serialized with it.
				226	*/
				227	if (!rm) {
				228	unsigned int len;
				229
				230	batch_count++;
				231
				232	/* we want to process as big a batch as we can, but
				233	* we also want to avoid softlockups. If we've been
				234	* through a lot of messages, lets back off and see
				235	* if anyone else jumps in
				236	*/
				237	if (batch_count >= send_batch_count)
				238	goto over_batch;
				239
				240	spin_lock_irqsave(&cp->cp_lock, flags);
				241
				242	if (!list_empty(&cp->cp_send_queue)) {
				243	rm = list_entry(cp->cp_send_queue.next,
				244	struct rds_message,
				245	m_conn_item);
				246	rds_message_addref(rm);
				247
				248	/*
				249	* Move the message from the send queue to the retransmit
				250	* list right away.
				251	*/
				252	list_move_tail(&rm->m_conn_item,
				253	&cp->cp_retrans);
				254	}
				255
				256	spin_unlock_irqrestore(&cp->cp_lock, flags);
				257
				258	if (!rm)
				259	break;
				260
				261	/* Unfortunately, the way Infiniband deals with
				262	* RDMA to a bad MR key is by moving the entire
				263	* queue pair to error state. We cold possibly
				264	* recover from that, but right now we drop the
				265	* connection.
				266	* Therefore, we never retransmit messages with RDMA ops.
				267	*/
				268	if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) \|\|
				269	(rm->rdma.op_active &&
				270	test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) {
				271	spin_lock_irqsave(&cp->cp_lock, flags);
				272	if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
				273	list_move(&rm->m_conn_item, &to_be_dropped);
				274	spin_unlock_irqrestore(&cp->cp_lock, flags);
				275	continue;
				276	}
				277
				278	/* Require an ACK every once in a while */
				279	len = ntohl(rm->m_inc.i_hdr.h_len);
				280	if (cp->cp_unacked_packets == 0 \|\|
				281	cp->cp_unacked_bytes < len) {
				282	set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				283
				284	cp->cp_unacked_packets =
				285	rds_sysctl_max_unacked_packets;
				286	cp->cp_unacked_bytes =
				287	rds_sysctl_max_unacked_bytes;
				288	rds_stats_inc(s_send_ack_required);
				289	} else {
				290	cp->cp_unacked_bytes -= len;
				291	cp->cp_unacked_packets--;
				292	}
				293
				294	cp->cp_xmit_rm = rm;
				295	}
				296
				297	/* The transport either sends the whole rdma or none of it */
				298	if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) {
				299	rm->m_final_op = &rm->rdma;
				300	/* The transport owns the mapped memory for now.
				301	* You can't unmap it while it's on the send queue
				302	*/
				303	set_bit(RDS_MSG_MAPPED, &rm->m_flags);
				304	ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
				305	if (ret) {
				306	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
				307	wake_up_interruptible(&rm->m_flush_wait);
				308	break;
				309	}
				310	cp->cp_xmit_rdma_sent = 1;
				311
				312	}
				313
				314	if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) {
				315	rm->m_final_op = &rm->atomic;
				316	/* The transport owns the mapped memory for now.
				317	* You can't unmap it while it's on the send queue
				318	*/
				319	set_bit(RDS_MSG_MAPPED, &rm->m_flags);
				320	ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
				321	if (ret) {
				322	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
				323	wake_up_interruptible(&rm->m_flush_wait);
				324	break;
				325	}
				326	cp->cp_xmit_atomic_sent = 1;
				327
				328	}
				329
				330	/*
				331	* A number of cases require an RDS header to be sent
				332	* even if there is no data.
				333	* We permit 0-byte sends; rds-ping depends on this.
				334	* However, if there are exclusively attached silent ops,
				335	* we skip the hdr/data send, to enable silent operation.
				336	*/
				337	if (rm->data.op_nents == 0) {
				338	int ops_present;
				339	int all_ops_are_silent = 1;
				340
				341	ops_present = (rm->atomic.op_active \|\| rm->rdma.op_active);
				342	if (rm->atomic.op_active && !rm->atomic.op_silent)
				343	all_ops_are_silent = 0;
				344	if (rm->rdma.op_active && !rm->rdma.op_silent)
				345	all_ops_are_silent = 0;
				346
				347	if (ops_present && all_ops_are_silent
				348	&& !rm->m_rdma_cookie)
				349	rm->data.op_active = 0;
				350	}
				351
				352	if (rm->data.op_active && !cp->cp_xmit_data_sent) {
				353	rm->m_final_op = &rm->data;
				354
				355	ret = conn->c_trans->xmit(conn, rm,
				356	cp->cp_xmit_hdr_off,
				357	cp->cp_xmit_sg,
				358	cp->cp_xmit_data_off);
				359	if (ret <= 0)
				360	break;
				361
				362	if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) {
				363	tmp = min_t(int, ret,
				364	sizeof(struct rds_header) -
				365	cp->cp_xmit_hdr_off);
				366	cp->cp_xmit_hdr_off += tmp;
				367	ret -= tmp;
				368	}
				369
				370	sg = &rm->data.op_sg[cp->cp_xmit_sg];
				371	while (ret) {
				372	tmp = min_t(int, ret, sg->length -
				373	cp->cp_xmit_data_off);
				374	cp->cp_xmit_data_off += tmp;
				375	ret -= tmp;
				376	if (cp->cp_xmit_data_off == sg->length) {
				377	cp->cp_xmit_data_off = 0;
				378	sg++;
				379	cp->cp_xmit_sg++;
				380	BUG_ON(ret != 0 && cp->cp_xmit_sg ==
				381	rm->data.op_nents);
				382	}
				383	}
				384
				385	if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) &&
				386	(cp->cp_xmit_sg == rm->data.op_nents))
				387	cp->cp_xmit_data_sent = 1;
				388	}
				389
				390	/*
				391	* A rm will only take multiple times through this loop
				392	* if there is a data op. Thus, if the data is sent (or there was
				393	* none), then we're done with the rm.
				394	*/
				395	if (!rm->data.op_active \|\| cp->cp_xmit_data_sent) {
				396	cp->cp_xmit_rm = NULL;
				397	cp->cp_xmit_sg = 0;
				398	cp->cp_xmit_hdr_off = 0;
				399	cp->cp_xmit_data_off = 0;
				400	cp->cp_xmit_rdma_sent = 0;
				401	cp->cp_xmit_atomic_sent = 0;
				402	cp->cp_xmit_data_sent = 0;
				403
				404	rds_message_put(rm);
				405	}
				406	}
				407
				408	over_batch:
				409	if (conn->c_trans->xmit_path_complete)
				410	conn->c_trans->xmit_path_complete(cp);
				411	release_in_xmit(cp);
				412
				413	/* Nuke any messages we decided not to retransmit. */
				414	if (!list_empty(&to_be_dropped)) {
				415	/* irqs on here, so we can put(), unlike above */
				416	list_for_each_entry(rm, &to_be_dropped, m_conn_item)
				417	rds_message_put(rm);
				418	rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
				419	}
				420
				421	/*
				422	* Other senders can queue a message after we last test the send queue
				423	* but before we clear RDS_IN_XMIT. In that case they'd back off and
				424	* not try and send their newly queued message. We need to check the
				425	* send queue after having cleared RDS_IN_XMIT so that their message
				426	* doesn't get stuck on the send queue.
				427	*
				428	* If the transport cannot continue (i.e ret != 0), then it must
				429	* call us when more room is available, such as from the tx
				430	* completion handler.
				431	*
				432	* We have an extra generation check here so that if someone manages
				433	* to jump in after our release_in_xmit, we'll see that they have done
				434	* some work and we will skip our goto
				435	*/
				436	if (ret == 0) {
				437	bool raced;
				438
				439	smp_mb();
				440	raced = send_gen != READ_ONCE(cp->cp_send_gen);
				441
				442	if ((test_bit(0, &conn->c_map_queued) \|\|
				443	!list_empty(&cp->cp_send_queue)) && !raced) {
				444	if (batch_count < send_batch_count)
				445	goto restart;
				446	rcu_read_lock();
				447	if (rds_destroy_pending(cp->cp_conn))
				448	ret = -ENETUNREACH;
				449	else
				450	queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
				451	rcu_read_unlock();
				452	} else if (raced) {
				453	rds_stats_inc(s_send_lock_queue_raced);
				454	}
				455	}
				456	out:
				457	return ret;
				458	}
				459	EXPORT_SYMBOL_GPL(rds_send_xmit);
				460
				461	static void rds_send_sndbuf_remove(struct rds_sock rs, struct rds_message rm)
				462	{
				463	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				464
				465	assert_spin_locked(&rs->rs_lock);
				466
				467	BUG_ON(rs->rs_snd_bytes < len);
				468	rs->rs_snd_bytes -= len;
				469
				470	if (rs->rs_snd_bytes == 0)
				471	rds_stats_inc(s_send_queue_empty);
				472	}
				473
				474	static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
				475	is_acked_func is_acked)
				476	{
				477	if (is_acked)
				478	return is_acked(rm, ack);
				479	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
				480	}
				481
				482	/*
				483	* This is pretty similar to what happens below in the ACK
				484	* handling code - except that we call here as soon as we get
				485	* the IB send completion on the RDMA op and the accompanying
				486	* message.
				487	*/
				488	void rds_rdma_send_complete(struct rds_message *rm, int status)
				489	{
				490	struct rds_sock *rs = NULL;
				491	struct rm_rdma_op *ro;
				492	struct rds_notifier *notifier;
				493	unsigned long flags;
				494	unsigned int notify = 0;
				495
				496	spin_lock_irqsave(&rm->m_rs_lock, flags);
				497
				498	notify = rm->rdma.op_notify \| rm->data.op_notify;
				499	ro = &rm->rdma;
				500	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
				501	ro->op_active && notify && ro->op_notifier) {
				502	notifier = ro->op_notifier;
				503	rs = rm->m_rs;
				504	sock_hold(rds_rs_to_sk(rs));
				505
				506	notifier->n_status = status;
				507	spin_lock(&rs->rs_lock);
				508	list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
				509	spin_unlock(&rs->rs_lock);
				510
				511	ro->op_notifier = NULL;
				512	}
				513
				514	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
				515
				516	if (rs) {
				517	rds_wake_sk_sleep(rs);
				518	sock_put(rds_rs_to_sk(rs));
				519	}
				520	}
				521	EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
				522
				523	/*
				524	* Just like above, except looks at atomic op
				525	*/
				526	void rds_atomic_send_complete(struct rds_message *rm, int status)
				527	{
				528	struct rds_sock *rs = NULL;
				529	struct rm_atomic_op *ao;
				530	struct rds_notifier *notifier;
				531	unsigned long flags;
				532
				533	spin_lock_irqsave(&rm->m_rs_lock, flags);
				534
				535	ao = &rm->atomic;
				536	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
				537	&& ao->op_active && ao->op_notify && ao->op_notifier) {
				538	notifier = ao->op_notifier;
				539	rs = rm->m_rs;
				540	sock_hold(rds_rs_to_sk(rs));
				541
				542	notifier->n_status = status;
				543	spin_lock(&rs->rs_lock);
				544	list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
				545	spin_unlock(&rs->rs_lock);
				546
				547	ao->op_notifier = NULL;
				548	}
				549
				550	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
				551
				552	if (rs) {
				553	rds_wake_sk_sleep(rs);
				554	sock_put(rds_rs_to_sk(rs));
				555	}
				556	}
				557	EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
				558
				559	/*
				560	* This is the same as rds_rdma_send_complete except we
				561	* don't do any locking - we have all the ingredients (message,
				562	* socket, socket lock) and can just move the notifier.
				563	*/
				564	static inline void
				565	__rds_send_complete(struct rds_sock rs, struct rds_message rm, int status)
				566	{
				567	struct rm_rdma_op *ro;
				568	struct rm_atomic_op *ao;
				569
				570	ro = &rm->rdma;
				571	if (ro->op_active && ro->op_notify && ro->op_notifier) {
				572	ro->op_notifier->n_status = status;
				573	list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
				574	ro->op_notifier = NULL;
				575	}
				576
				577	ao = &rm->atomic;
				578	if (ao->op_active && ao->op_notify && ao->op_notifier) {
				579	ao->op_notifier->n_status = status;
				580	list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
				581	ao->op_notifier = NULL;
				582	}
				583
				584	/* No need to wake the app - caller does this */
				585	}
				586
				587	/*
				588	* This removes messages from the socket's list if they're on it. The list
				589	* argument must be private to the caller, we must be able to modify it
				590	* without locks. The messages must have a reference held for their
				591	* position on the list. This function will drop that reference after
				592	* removing the messages from the 'messages' list regardless of if it found
				593	* the messages on the socket list or not.
				594	*/
				595	static void rds_send_remove_from_sock(struct list_head *messages, int status)
				596	{
				597	unsigned long flags;
				598	struct rds_sock *rs = NULL;
				599	struct rds_message *rm;
				600
				601	while (!list_empty(messages)) {
				602	int was_on_sock = 0;
				603
				604	rm = list_entry(messages->next, struct rds_message,
				605	m_conn_item);
				606	list_del_init(&rm->m_conn_item);
				607
				608	/*
				609	* If we see this flag cleared then we're sure that someone
				610	* else beat us to removing it from the sock. If we race
				611	* with their flag update we'll get the lock and then really
				612	* see that the flag has been cleared.
				613	*
				614	* The message spinlock makes sure nobody clears rm->m_rs
				615	* while we're messing with it. It does not prevent the
				616	* message from being removed from the socket, though.
				617	*/
				618	spin_lock_irqsave(&rm->m_rs_lock, flags);
				619	if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
				620	goto unlock_and_drop;
				621
				622	if (rs != rm->m_rs) {
				623	if (rs) {
				624	rds_wake_sk_sleep(rs);
				625	sock_put(rds_rs_to_sk(rs));
				626	}
				627	rs = rm->m_rs;
				628	if (rs)
				629	sock_hold(rds_rs_to_sk(rs));
				630	}
				631	if (!rs)
				632	goto unlock_and_drop;
				633	spin_lock(&rs->rs_lock);
				634
				635	if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
				636	struct rm_rdma_op *ro = &rm->rdma;
				637	struct rds_notifier *notifier;
				638
				639	list_del_init(&rm->m_sock_item);
				640	rds_send_sndbuf_remove(rs, rm);
				641
				642	if (ro->op_active && ro->op_notifier &&
				643	(ro->op_notify \|\| (ro->op_recverr && status))) {
				644	notifier = ro->op_notifier;
				645	list_add_tail(&notifier->n_list,
				646	&rs->rs_notify_queue);
				647	if (!notifier->n_status)
				648	notifier->n_status = status;
				649	rm->rdma.op_notifier = NULL;
				650	}
				651	was_on_sock = 1;
				652	}
				653	spin_unlock(&rs->rs_lock);
				654
				655	unlock_and_drop:
				656	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
				657	rds_message_put(rm);
				658	if (was_on_sock)
				659	rds_message_put(rm);
				660	}
				661
				662	if (rs) {
				663	rds_wake_sk_sleep(rs);
				664	sock_put(rds_rs_to_sk(rs));
				665	}
				666	}
				667
				668	/*
				669	* Transports call here when they've determined that the receiver queued
				670	* messages up to, and including, the given sequence number. Messages are
				671	* moved to the retrans queue when rds_send_xmit picks them off the send
				672	* queue. This means that in the TCP case, the message may not have been
				673	* assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
				674	* checks the RDS_MSG_HAS_ACK_SEQ bit.
				675	*/
				676	void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
				677	is_acked_func is_acked)
				678	{
				679	struct rds_message rm, tmp;
				680	unsigned long flags;
				681	LIST_HEAD(list);
				682
				683	spin_lock_irqsave(&cp->cp_lock, flags);
				684
				685	list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
				686	if (!rds_send_is_acked(rm, ack, is_acked))
				687	break;
				688
				689	list_move(&rm->m_conn_item, &list);
				690	clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				691	}
				692
				693	/* order flag updates with spin locks */
				694	if (!list_empty(&list))
				695	smp_mb__after_atomic();
				696
				697	spin_unlock_irqrestore(&cp->cp_lock, flags);
				698
				699	/* now remove the messages from the sock list as needed */
				700	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
				701	}
				702	EXPORT_SYMBOL_GPL(rds_send_path_drop_acked);
				703
				704	void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
				705	is_acked_func is_acked)
				706	{
				707	WARN_ON(conn->c_trans->t_mp_capable);
				708	rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked);
				709	}
				710	EXPORT_SYMBOL_GPL(rds_send_drop_acked);
				711
				712	void rds_send_drop_to(struct rds_sock rs, struct sockaddr_in6 dest)
				713	{
				714	struct rds_message rm, tmp;
				715	struct rds_connection *conn;
				716	struct rds_conn_path *cp;
				717	unsigned long flags;
				718	LIST_HEAD(list);
				719
				720	/* get all the messages we're dropping under the rs lock */
				721	spin_lock_irqsave(&rs->rs_lock, flags);
				722
				723	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
				724	if (dest &&
				725	(!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) \|\|
				726	dest->sin6_port != rm->m_inc.i_hdr.h_dport))
				727	continue;
				728
				729	list_move(&rm->m_sock_item, &list);
				730	rds_send_sndbuf_remove(rs, rm);
				731	clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
				732	}
				733
				734	/* order flag updates with the rs lock */
				735	smp_mb__after_atomic();
				736
				737	spin_unlock_irqrestore(&rs->rs_lock, flags);
				738
				739	if (list_empty(&list))
				740	return;
				741
				742	/* Remove the messages from the conn */
				743	list_for_each_entry(rm, &list, m_sock_item) {
				744
				745	conn = rm->m_inc.i_conn;
				746	if (conn->c_trans->t_mp_capable)
				747	cp = rm->m_inc.i_conn_path;
				748	else
				749	cp = &conn->c_path[0];
				750
				751	spin_lock_irqsave(&cp->cp_lock, flags);
				752	/*
				753	* Maybe someone else beat us to removing rm from the conn.
				754	* If we race with their flag update we'll get the lock and
				755	* then really see that the flag has been cleared.
				756	*/
				757	if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
				758	spin_unlock_irqrestore(&cp->cp_lock, flags);
				759	continue;
				760	}
				761	list_del_init(&rm->m_conn_item);
				762	spin_unlock_irqrestore(&cp->cp_lock, flags);
				763
				764	/*
				765	* Couldn't grab m_rs_lock in top loop (lock ordering),
				766	* but we can now.
				767	*/
				768	spin_lock_irqsave(&rm->m_rs_lock, flags);
				769
				770	spin_lock(&rs->rs_lock);
				771	__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
				772	spin_unlock(&rs->rs_lock);
				773
				774	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
				775
				776	rds_message_put(rm);
				777	}
				778
				779	rds_wake_sk_sleep(rs);
				780
				781	while (!list_empty(&list)) {
				782	rm = list_entry(list.next, struct rds_message, m_sock_item);
				783	list_del_init(&rm->m_sock_item);
				784	rds_message_wait(rm);
				785
				786	/* just in case the code above skipped this message
				787	* because RDS_MSG_ON_CONN wasn't set, run it again here
				788	* taking m_rs_lock is the only thing that keeps us
				789	* from racing with ack processing.
				790	*/
				791	spin_lock_irqsave(&rm->m_rs_lock, flags);
				792
				793	spin_lock(&rs->rs_lock);
				794	__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
				795	spin_unlock(&rs->rs_lock);
				796
				797	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
				798
				799	rds_message_put(rm);
				800	}
				801	}
				802
				803	/*
				804	* we only want this to fire once so we use the callers 'queued'. It's
				805	* possible that another thread can race with us and remove the
				806	* message from the flow with RDS_CANCEL_SENT_TO.
				807	*/
				808	static int rds_send_queue_rm(struct rds_sock rs, struct rds_connection conn,
				809	struct rds_conn_path *cp,
				810	struct rds_message *rm, __be16 sport,
				811	__be16 dport, int *queued)
				812	{
				813	unsigned long flags;
				814	u32 len;
				815
				816	if (*queued)
				817	goto out;
				818
				819	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				820
				821	/* this is the only place which holds both the socket's rs_lock
				822	* and the connection's c_lock */
				823	spin_lock_irqsave(&rs->rs_lock, flags);
				824
				825	/*
				826	* If there is a little space in sndbuf, we don't queue anything,
				827	* and userspace gets -EAGAIN. But poll() indicates there's send
				828	* room. This can lead to bad behavior (spinning) if snd_bytes isn't
				829	* freed up by incoming acks. So we check the old value of
				830	* rs_snd_bytes here to allow the last msg to exceed the buffer,
				831	* and poll() now knows no more data can be sent.
				832	*/
				833	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
				834	rs->rs_snd_bytes += len;
				835
				836	/* let recv side know we are close to send space exhaustion.
				837	* This is probably not the optimal way to do it, as this
				838	* means we set the flag on all messages as soon as our
				839	* throughput hits a certain threshold.
				840	*/
				841	if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
				842	set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				843
				844	list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
				845	set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
				846	rds_message_addref(rm);
				847	sock_hold(rds_rs_to_sk(rs));
				848	rm->m_rs = rs;
				849
				850	/* The code ordering is a little weird, but we're
				851	trying to minimize the time we hold c_lock */
				852	rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
				853	rm->m_inc.i_conn = conn;
				854	rm->m_inc.i_conn_path = cp;
				855	rds_message_addref(rm);
				856
				857	spin_lock(&cp->cp_lock);
				858	rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++);
				859	list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
				860	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				861	spin_unlock(&cp->cp_lock);
				862
				863	rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
				864	rm, len, rs, rs->rs_snd_bytes,
				865	(unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
				866
				867	*queued = 1;
				868	}
				869
				870	spin_unlock_irqrestore(&rs->rs_lock, flags);
				871	out:
				872	return *queued;
				873	}
				874
				875	/*
				876	* rds_message is getting to be quite complicated, and we'd like to allocate
				877	* it all in one go. This figures out how big it needs to be up front.
				878	*/
				879	static int rds_rm_size(struct msghdr *msg, int num_sgs,
				880	struct rds_iov_vector_arr *vct)
				881	{
				882	struct cmsghdr *cmsg;
				883	int size = 0;
				884	int cmsg_groups = 0;
				885	int retval;
				886	bool zcopy_cookie = false;
				887	struct rds_iov_vector iov, tmp_iov;
				888
				889	for_each_cmsghdr(cmsg, msg) {
				890	if (!CMSG_OK(msg, cmsg))
				891	return -EINVAL;
				892
				893	if (cmsg->cmsg_level != SOL_RDS)
				894	continue;
				895
				896	switch (cmsg->cmsg_type) {
				897	case RDS_CMSG_RDMA_ARGS:
				898	if (vct->indx >= vct->len) {
				899	vct->len += vct->incr;
				900	tmp_iov =
				901	krealloc(vct->vec,
				902	vct->len *
				903	sizeof(struct rds_iov_vector),
				904	GFP_KERNEL);
				905	if (!tmp_iov) {
				906	vct->len -= vct->incr;
				907	return -ENOMEM;
				908	}
				909	vct->vec = tmp_iov;
				910	}
				911	iov = &vct->vec[vct->indx];
				912	memset(iov, 0, sizeof(struct rds_iov_vector));
				913	vct->indx++;
				914	cmsg_groups \|= 1;
				915	retval = rds_rdma_extra_size(CMSG_DATA(cmsg), iov);
				916	if (retval < 0)
				917	return retval;
				918	size += retval;
				919
				920	break;
				921
				922	case RDS_CMSG_ZCOPY_COOKIE:
				923	zcopy_cookie = true;
				924	/* fall through */
				925
				926	case RDS_CMSG_RDMA_DEST:
				927	case RDS_CMSG_RDMA_MAP:
				928	cmsg_groups \|= 2;
				929	/* these are valid but do no add any size */
				930	break;
				931
				932	case RDS_CMSG_ATOMIC_CSWP:
				933	case RDS_CMSG_ATOMIC_FADD:
				934	case RDS_CMSG_MASKED_ATOMIC_CSWP:
				935	case RDS_CMSG_MASKED_ATOMIC_FADD:
				936	cmsg_groups \|= 1;
				937	size += sizeof(struct scatterlist);
				938	break;
				939
				940	default:
				941	return -EINVAL;
				942	}
				943
				944	}
				945
				946	if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie)
				947	return -EINVAL;
				948
				949	size += num_sgs * sizeof(struct scatterlist);
				950
				951	/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
				952	if (cmsg_groups == 3)
				953	return -EINVAL;
				954
				955	return size;
				956	}
				957
				958	static int rds_cmsg_zcopy(struct rds_sock rs, struct rds_message rm,
				959	struct cmsghdr *cmsg)
				960	{
				961	u32 *cookie;
				962
				963	if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)) \|\|
				964	!rm->data.op_mmp_znotifier)
				965	return -EINVAL;
				966	cookie = CMSG_DATA(cmsg);
				967	rm->data.op_mmp_znotifier->z_cookie = *cookie;
				968	return 0;
				969	}
				970
				971	static int rds_cmsg_send(struct rds_sock rs, struct rds_message rm,
				972	struct msghdr msg, int allocated_mr,
				973	struct rds_iov_vector_arr *vct)
				974	{
				975	struct cmsghdr *cmsg;
				976	int ret = 0, ind = 0;
				977
				978	for_each_cmsghdr(cmsg, msg) {
				979	if (!CMSG_OK(msg, cmsg))
				980	return -EINVAL;
				981
				982	if (cmsg->cmsg_level != SOL_RDS)
				983	continue;
				984
				985	/* As a side effect, RDMA_DEST and RDMA_MAP will set
				986	* rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
				987	*/
				988	switch (cmsg->cmsg_type) {
				989	case RDS_CMSG_RDMA_ARGS:
				990	if (ind >= vct->indx)
				991	return -ENOMEM;
				992	ret = rds_cmsg_rdma_args(rs, rm, cmsg, &vct->vec[ind]);
				993	ind++;
				994	break;
				995
				996	case RDS_CMSG_RDMA_DEST:
				997	ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
				998	break;
				999
				1000	case RDS_CMSG_RDMA_MAP:
				1001	ret = rds_cmsg_rdma_map(rs, rm, cmsg);
				1002	if (!ret)
				1003	*allocated_mr = 1;
				1004	else if (ret == -ENODEV)
				1005	/* Accommodate the get_mr() case which can fail
				1006	* if connection isn't established yet.
				1007	*/
				1008	ret = -EAGAIN;
				1009	break;
				1010	case RDS_CMSG_ATOMIC_CSWP:
				1011	case RDS_CMSG_ATOMIC_FADD:
				1012	case RDS_CMSG_MASKED_ATOMIC_CSWP:
				1013	case RDS_CMSG_MASKED_ATOMIC_FADD:
				1014	ret = rds_cmsg_atomic(rs, rm, cmsg);
				1015	break;
				1016
				1017	case RDS_CMSG_ZCOPY_COOKIE:
				1018	ret = rds_cmsg_zcopy(rs, rm, cmsg);
				1019	break;
				1020
				1021	default:
				1022	return -EINVAL;
				1023	}
				1024
				1025	if (ret)
				1026	break;
				1027	}
				1028
				1029	return ret;
				1030	}
				1031
				1032	static int rds_send_mprds_hash(struct rds_sock *rs,
				1033	struct rds_connection *conn, int nonblock)
				1034	{
				1035	int hash;
				1036
				1037	if (conn->c_npaths == 0)
				1038	hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS);
				1039	else
				1040	hash = RDS_MPATH_HASH(rs, conn->c_npaths);
				1041	if (conn->c_npaths == 0 && hash != 0) {
				1042	rds_send_ping(conn, 0);
				1043
				1044	/* The underlying connection is not up yet. Need to wait
				1045	* until it is up to be sure that the non-zero c_path can be
				1046	* used. But if we are interrupted, we have to use the zero
				1047	* c_path in case the connection ends up being non-MP capable.
				1048	*/
				1049	if (conn->c_npaths == 0) {
				1050	/* Cannot wait for the connection be made, so just use
				1051	* the base c_path.
				1052	*/
				1053	if (nonblock)
				1054	return 0;
				1055	if (wait_event_interruptible(conn->c_hs_waitq,
				1056	conn->c_npaths != 0))
				1057	hash = 0;
				1058	}
				1059	if (conn->c_npaths == 1)
				1060	hash = 0;
				1061	}
				1062	return hash;
				1063	}
				1064
				1065	static int rds_rdma_bytes(struct msghdr msg, size_t rdma_bytes)
				1066	{
				1067	struct rds_rdma_args *args;
				1068	struct cmsghdr *cmsg;
				1069
				1070	for_each_cmsghdr(cmsg, msg) {
				1071	if (!CMSG_OK(msg, cmsg))
				1072	return -EINVAL;
				1073
				1074	if (cmsg->cmsg_level != SOL_RDS)
				1075	continue;
				1076
				1077	if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
				1078	if (cmsg->cmsg_len <
				1079	CMSG_LEN(sizeof(struct rds_rdma_args)))
				1080	return -EINVAL;
				1081	args = CMSG_DATA(cmsg);
				1082	*rdma_bytes += args->remote_vec.bytes;
				1083	}
				1084	}
				1085	return 0;
				1086	}
				1087
				1088	int rds_sendmsg(struct socket sock, struct msghdr msg, size_t payload_len)
				1089	{
				1090	struct sock *sk = sock->sk;
				1091	struct rds_sock *rs = rds_sk_to_rs(sk);
				1092	DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
				1093	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
				1094	__be16 dport;
				1095	struct rds_message *rm = NULL;
				1096	struct rds_connection *conn;
				1097	int ret = 0;
				1098	int queued = 0, allocated_mr = 0;
				1099	int nonblock = msg->msg_flags & MSG_DONTWAIT;
				1100	long timeo = sock_sndtimeo(sk, nonblock);
				1101	struct rds_conn_path *cpath;
				1102	struct in6_addr daddr;
				1103	__u32 scope_id = 0;
				1104	size_t total_payload_len = payload_len, rdma_payload_len = 0;
				1105	bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
				1106	sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
				1107	int num_sgs = ceil(payload_len, PAGE_SIZE);
				1108	int namelen;
				1109	struct rds_iov_vector_arr vct;
				1110	int ind;
				1111
				1112	memset(&vct, 0, sizeof(vct));
				1113
				1114	/* expect 1 RDMA CMSG per rds_sendmsg. can still grow if more needed. */
				1115	vct.incr = 1;
				1116
				1117	/* Mirror Linux UDP mirror of BSD error message compatibility */
				1118	/* XXX: Perhaps MSG_MORE someday */
				1119	if (msg->msg_flags & ~(MSG_DONTWAIT \| MSG_CMSG_COMPAT \| MSG_ZEROCOPY)) {
				1120	ret = -EOPNOTSUPP;
				1121	goto out;
				1122	}
				1123
				1124	namelen = msg->msg_namelen;
				1125	if (namelen != 0) {
				1126	if (namelen < sizeof(*usin)) {
				1127	ret = -EINVAL;
				1128	goto out;
				1129	}
				1130	switch (usin->sin_family) {
				1131	case AF_INET:
				1132	if (usin->sin_addr.s_addr == htonl(INADDR_ANY) \|\|
				1133	usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) \|\|
				1134	IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) {
				1135	ret = -EINVAL;
				1136	goto out;
				1137	}
				1138	ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr);
				1139	dport = usin->sin_port;
				1140	break;
				1141
				1142	#if IS_ENABLED(CONFIG_IPV6)
				1143	case AF_INET6: {
				1144	int addr_type;
				1145
				1146	if (namelen < sizeof(*sin6)) {
				1147	ret = -EINVAL;
				1148	goto out;
				1149	}
				1150	addr_type = ipv6_addr_type(&sin6->sin6_addr);
				1151	if (!(addr_type & IPV6_ADDR_UNICAST)) {
				1152	__be32 addr4;
				1153
				1154	if (!(addr_type & IPV6_ADDR_MAPPED)) {
				1155	ret = -EINVAL;
				1156	goto out;
				1157	}
				1158
				1159	/* It is a mapped address. Need to do some
				1160	* sanity checks.
				1161	*/
				1162	addr4 = sin6->sin6_addr.s6_addr32[3];
				1163	if (addr4 == htonl(INADDR_ANY) \|\|
				1164	addr4 == htonl(INADDR_BROADCAST) \|\|
				1165	IN_MULTICAST(ntohl(addr4))) {
				1166	ret = -EINVAL;
				1167	goto out;
				1168	}
				1169	}
				1170	if (addr_type & IPV6_ADDR_LINKLOCAL) {
				1171	if (sin6->sin6_scope_id == 0) {
				1172	ret = -EINVAL;
				1173	goto out;
				1174	}
				1175	scope_id = sin6->sin6_scope_id;
				1176	}
				1177
				1178	daddr = sin6->sin6_addr;
				1179	dport = sin6->sin6_port;
				1180	break;
				1181	}
				1182	#endif
				1183
				1184	default:
				1185	ret = -EINVAL;
				1186	goto out;
				1187	}
				1188	} else {
				1189	/* We only care about consistency with ->connect() */
				1190	lock_sock(sk);
				1191	daddr = rs->rs_conn_addr;
				1192	dport = rs->rs_conn_port;
				1193	scope_id = rs->rs_bound_scope_id;
				1194	release_sock(sk);
				1195	}
				1196
				1197	lock_sock(sk);
				1198	if (ipv6_addr_any(&rs->rs_bound_addr) \|\| ipv6_addr_any(&daddr)) {
				1199	release_sock(sk);
				1200	ret = -ENOTCONN;
				1201	goto out;
				1202	} else if (namelen != 0) {
				1203	/* Cannot send to an IPv4 address using an IPv6 source
				1204	* address and cannot send to an IPv6 address using an
				1205	* IPv4 source address.
				1206	*/
				1207	if (ipv6_addr_v4mapped(&daddr) ^
				1208	ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
				1209	release_sock(sk);
				1210	ret = -EOPNOTSUPP;
				1211	goto out;
				1212	}
				1213	/* If the socket is already bound to a link local address,
				1214	* it can only send to peers on the same link. But allow
				1215	* communicating beween link local and non-link local address.
				1216	*/
				1217	if (scope_id != rs->rs_bound_scope_id) {
				1218	if (!scope_id) {
				1219	scope_id = rs->rs_bound_scope_id;
				1220	} else if (rs->rs_bound_scope_id) {
				1221	release_sock(sk);
				1222	ret = -EINVAL;
				1223	goto out;
				1224	}
				1225	}
				1226	}
				1227	release_sock(sk);
				1228
				1229	ret = rds_rdma_bytes(msg, &rdma_payload_len);
				1230	if (ret)
				1231	goto out;
				1232
				1233	total_payload_len += rdma_payload_len;
				1234	if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
				1235	ret = -EMSGSIZE;
				1236	goto out;
				1237	}
				1238
				1239	if (payload_len > rds_sk_sndbuf(rs)) {
				1240	ret = -EMSGSIZE;
				1241	goto out;
				1242	}
				1243
				1244	if (zcopy) {
				1245	if (rs->rs_transport->t_type != RDS_TRANS_TCP) {
				1246	ret = -EOPNOTSUPP;
				1247	goto out;
				1248	}
				1249	num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX);
				1250	}
				1251	/* size of rm including all sgs */
				1252	ret = rds_rm_size(msg, num_sgs, &vct);
				1253	if (ret < 0)
				1254	goto out;
				1255
				1256	rm = rds_message_alloc(ret, GFP_KERNEL);
				1257	if (!rm) {
				1258	ret = -ENOMEM;
				1259	goto out;
				1260	}
				1261
				1262	/* Attach data to the rm */
				1263	if (payload_len) {
				1264	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
				1265	if (!rm->data.op_sg) {
				1266	ret = -ENOMEM;
				1267	goto out;
				1268	}
				1269	ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);
				1270	if (ret)
				1271	goto out;
				1272	}
				1273	rm->data.op_active = 1;
				1274
				1275	rm->m_daddr = daddr;
				1276
				1277	/* rds_conn_create has a spinlock that runs with IRQ off.
				1278	* Caching the conn in the socket helps a lot. */
				1279	if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr))
				1280	conn = rs->rs_conn;
				1281	else {
				1282	conn = rds_conn_create_outgoing(sock_net(sock->sk),
				1283	&rs->rs_bound_addr, &daddr,
				1284	rs->rs_transport,
				1285	sock->sk->sk_allocation,
				1286	scope_id);
				1287	if (IS_ERR(conn)) {
				1288	ret = PTR_ERR(conn);
				1289	goto out;
				1290	}
				1291	rs->rs_conn = conn;
				1292	}
				1293
				1294	if (conn->c_trans->t_mp_capable)
				1295	cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)];
				1296	else
				1297	cpath = &conn->c_path[0];
				1298
				1299	rm->m_conn_path = cpath;
				1300
				1301	/* Parse any control messages the user may have included. */
				1302	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct);
				1303	if (ret) {
				1304	/* Trigger connection so that its ready for the next retry */
				1305	if (ret == -EAGAIN)
				1306	rds_conn_connect_if_down(conn);
				1307	goto out;
				1308	}
				1309
				1310	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
				1311	printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
				1312	&rm->rdma, conn->c_trans->xmit_rdma);
				1313	ret = -EOPNOTSUPP;
				1314	goto out;
				1315	}
				1316
				1317	if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
				1318	printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
				1319	&rm->atomic, conn->c_trans->xmit_atomic);
				1320	ret = -EOPNOTSUPP;
				1321	goto out;
				1322	}
				1323
				1324	if (rds_destroy_pending(conn)) {
				1325	ret = -EAGAIN;
				1326	goto out;
				1327	}
				1328
				1329	rds_conn_path_connect_if_down(cpath);
				1330
				1331	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
				1332	if (ret) {
				1333	rs->rs_seen_congestion = 1;
				1334	goto out;
				1335	}
				1336	while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port,
				1337	dport, &queued)) {
				1338	rds_stats_inc(s_send_queue_full);
				1339
				1340	if (nonblock) {
				1341	ret = -EAGAIN;
				1342	goto out;
				1343	}
				1344
				1345	timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
				1346	rds_send_queue_rm(rs, conn, cpath, rm,
				1347	rs->rs_bound_port,
				1348	dport,
				1349	&queued),
				1350	timeo);
				1351	rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
				1352	if (timeo > 0 \|\| timeo == MAX_SCHEDULE_TIMEOUT)
				1353	continue;
				1354
				1355	ret = timeo;
				1356	if (ret == 0)
				1357	ret = -ETIMEDOUT;
				1358	goto out;
				1359	}
				1360
				1361	/*
				1362	* By now we've committed to the send. We reuse rds_send_worker()
				1363	* to retry sends in the rds thread if the transport asks us to.
				1364	*/
				1365	rds_stats_inc(s_send_queued);
				1366
				1367	ret = rds_send_xmit(cpath);
				1368	if (ret == -ENOMEM \|\| ret == -EAGAIN) {
				1369	ret = 0;
				1370	rcu_read_lock();
				1371	if (rds_destroy_pending(cpath->cp_conn))
				1372	ret = -ENETUNREACH;
				1373	else
				1374	queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
				1375	rcu_read_unlock();
				1376	}
				1377	if (ret)
				1378	goto out;
				1379	rds_message_put(rm);
				1380
				1381	for (ind = 0; ind < vct.indx; ind++)
				1382	kfree(vct.vec[ind].iov);
				1383	kfree(vct.vec);
				1384
				1385	return payload_len;
				1386
				1387	out:
				1388	for (ind = 0; ind < vct.indx; ind++)
				1389	kfree(vct.vec[ind].iov);
				1390	kfree(vct.vec);
				1391
				1392	/* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
				1393	* If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
				1394	* or in any other way, we need to destroy the MR again */
				1395	if (allocated_mr)
				1396	rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
				1397
				1398	if (rm)
				1399	rds_message_put(rm);
				1400	return ret;
				1401	}
				1402
				1403	/*
				1404	* send out a probe. Can be shared by rds_send_ping,
				1405	* rds_send_pong, rds_send_hb.
				1406	* rds_send_hb should use h_flags
				1407	* RDS_FLAG_HB_PING\|RDS_FLAG_ACK_REQUIRED
				1408	* or
				1409	* RDS_FLAG_HB_PONG\|RDS_FLAG_ACK_REQUIRED
				1410	*/
				1411	static int
				1412	rds_send_probe(struct rds_conn_path *cp, __be16 sport,
				1413	__be16 dport, u8 h_flags)
				1414	{
				1415	struct rds_message *rm;
				1416	unsigned long flags;
				1417	int ret = 0;
				1418
				1419	rm = rds_message_alloc(0, GFP_ATOMIC);
				1420	if (!rm) {
				1421	ret = -ENOMEM;
				1422	goto out;
				1423	}
				1424
				1425	rm->m_daddr = cp->cp_conn->c_faddr;
				1426	rm->data.op_active = 1;
				1427
				1428	rds_conn_path_connect_if_down(cp);
				1429
				1430	ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL);
				1431	if (ret)
				1432	goto out;
				1433
				1434	spin_lock_irqsave(&cp->cp_lock, flags);
				1435	list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
				1436	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				1437	rds_message_addref(rm);
				1438	rm->m_inc.i_conn = cp->cp_conn;
				1439	rm->m_inc.i_conn_path = cp;
				1440
				1441	rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport,
				1442	cp->cp_next_tx_seq);
				1443	rm->m_inc.i_hdr.h_flags \|= h_flags;
				1444	cp->cp_next_tx_seq++;
				1445
				1446	if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) &&
				1447	cp->cp_conn->c_trans->t_mp_capable) {
				1448	u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
				1449	u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
				1450
				1451	rds_message_add_extension(&rm->m_inc.i_hdr,
				1452	RDS_EXTHDR_NPATHS, &npaths,
				1453	sizeof(npaths));
				1454	rds_message_add_extension(&rm->m_inc.i_hdr,
				1455	RDS_EXTHDR_GEN_NUM,
				1456	&my_gen_num,
				1457	sizeof(u32));
				1458	}
				1459	spin_unlock_irqrestore(&cp->cp_lock, flags);
				1460
				1461	rds_stats_inc(s_send_queued);
				1462	rds_stats_inc(s_send_pong);
				1463
				1464	/* schedule the send work on rds_wq */
				1465	rcu_read_lock();
				1466	if (!rds_destroy_pending(cp->cp_conn))
				1467	queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
				1468	rcu_read_unlock();
				1469
				1470	rds_message_put(rm);
				1471	return 0;
				1472
				1473	out:
				1474	if (rm)
				1475	rds_message_put(rm);
				1476	return ret;
				1477	}
				1478
				1479	int
				1480	rds_send_pong(struct rds_conn_path *cp, __be16 dport)
				1481	{
				1482	return rds_send_probe(cp, 0, dport, 0);
				1483	}
				1484
				1485	void
				1486	rds_send_ping(struct rds_connection *conn, int cp_index)
				1487	{
				1488	unsigned long flags;
				1489	struct rds_conn_path *cp = &conn->c_path[cp_index];
				1490
				1491	spin_lock_irqsave(&cp->cp_lock, flags);
				1492	if (conn->c_ping_triggered) {
				1493	spin_unlock_irqrestore(&cp->cp_lock, flags);
				1494	return;
				1495	}
				1496	conn->c_ping_triggered = 1;
				1497	spin_unlock_irqrestore(&cp->cp_lock, flags);
				1498	rds_send_probe(cp, cpu_to_be16(RDS_FLAG_PROBE_PORT), 0, 0);
				1499	}
				1500	EXPORT_SYMBOL_GPL(rds_send_ping);