Blame - marvell/linux/drivers/infiniband/sw/rdmavt/qp.c - T108

blob: 905e2eaed09514812597edf7bd98752024a801f6 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	/*
				2	* Copyright(c) 2016 - 2019 Intel Corporation.
				3	*
				4	* This file is provided under a dual BSD/GPLv2 license. When using or
				5	* redistributing this file, you may do so under either license.
				6	*
				7	* GPL LICENSE SUMMARY
				8	*
				9	* This program is free software; you can redistribute it and/or modify
				10	* it under the terms of version 2 of the GNU General Public License as
				11	* published by the Free Software Foundation.
				12	*
				13	* This program is distributed in the hope that it will be useful, but
				14	* WITHOUT ANY WARRANTY; without even the implied warranty of
				15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				16	* General Public License for more details.
				17	*
				18	* BSD LICENSE
				19	*
				20	* Redistribution and use in source and binary forms, with or without
				21	* modification, are permitted provided that the following conditions
				22	* are met:
				23	*
				24	* - Redistributions of source code must retain the above copyright
				25	* notice, this list of conditions and the following disclaimer.
				26	* - Redistributions in binary form must reproduce the above copyright
				27	* notice, this list of conditions and the following disclaimer in
				28	* the documentation and/or other materials provided with the
				29	* distribution.
				30	* - Neither the name of Intel Corporation nor the names of its
				31	* contributors may be used to endorse or promote products derived
				32	* from this software without specific prior written permission.
				33	*
				34	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				35	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				36	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				37	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				38	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				39	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				40	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				41	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				42	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				43	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				44	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				45	*
				46	*/
				47
				48	#include <linux/hash.h>
				49	#include <linux/bitops.h>
				50	#include <linux/lockdep.h>
				51	#include <linux/vmalloc.h>
				52	#include <linux/slab.h>
				53	#include <rdma/ib_verbs.h>
				54	#include <rdma/ib_hdrs.h>
				55	#include <rdma/opa_addr.h>
				56	#include <rdma/uverbs_ioctl.h>
				57	#include "qp.h"
				58	#include "vt.h"
				59	#include "trace.h"
				60
				#define RVT_RWQ_COUNT_THRESHOLD 16

				/* Forward declarations. */
				static void rvt_rc_timeout(struct timer_list *t);
				static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
							 enum ib_qp_type type);
				66
				67	/*
				68	* Convert the AETH RNR timeout code into the number of microseconds.
				69	*/
				70	static const u32 ib_rvt_rnr_table[32] = {
				71	655360, /* 00: 655.36 */
				72	10, /* 01: .01 */
				73	20, /* 02 .02 */
				74	30, /* 03: .03 */
				75	40, /* 04: .04 */
				76	60, /* 05: .06 */
				77	80, /* 06: .08 */
				78	120, /* 07: .12 */
				79	160, /* 08: .16 */
				80	240, /* 09: .24 */
				81	320, /* 0A: .32 */
				82	480, /* 0B: .48 */
				83	640, /* 0C: .64 */
				84	960, /* 0D: .96 */
				85	1280, /* 0E: 1.28 */
				86	1920, /* 0F: 1.92 */
				87	2560, /* 10: 2.56 */
				88	3840, /* 11: 3.84 */
				89	5120, /* 12: 5.12 */
				90	7680, /* 13: 7.68 */
				91	10240, /* 14: 10.24 */
				92	15360, /* 15: 15.36 */
				93	20480, /* 16: 20.48 */
				94	30720, /* 17: 30.72 */
				95	40960, /* 18: 40.96 */
				96	61440, /* 19: 61.44 */
				97	81920, /* 1A: 81.92 */
				98	122880, /* 1B: 122.88 */
				99	163840, /* 1C: 163.84 */
				100	245760, /* 1D: 245.76 */
				101	327680, /* 1E: 327.68 */
				102	491520 /* 1F: 491.52 */
				103	};
				104
				105	/*
				106	* Note that it is OK to post send work requests in the SQE and ERR
				107	* states; rvt_do_send() will process them and generate error
				108	* completions as per IB 1.2 C10-96.
				109	*/
				110	const int ib_rvt_state_ops[IB_QPS_ERR + 1] = {
				111	[IB_QPS_RESET] = 0,
				112	[IB_QPS_INIT] = RVT_POST_RECV_OK,
				113	[IB_QPS_RTR] = RVT_POST_RECV_OK \| RVT_PROCESS_RECV_OK,
				114	[IB_QPS_RTS] = RVT_POST_RECV_OK \| RVT_PROCESS_RECV_OK \|
				115	RVT_POST_SEND_OK \| RVT_PROCESS_SEND_OK \|
				116	RVT_PROCESS_NEXT_SEND_OK,
				117	[IB_QPS_SQD] = RVT_POST_RECV_OK \| RVT_PROCESS_RECV_OK \|
				118	RVT_POST_SEND_OK \| RVT_PROCESS_SEND_OK,
				119	[IB_QPS_SQE] = RVT_POST_RECV_OK \| RVT_PROCESS_RECV_OK \|
				120	RVT_POST_SEND_OK \| RVT_FLUSH_SEND,
				121	[IB_QPS_ERR] = RVT_POST_RECV_OK \| RVT_FLUSH_RECV \|
				122	RVT_POST_SEND_OK \| RVT_FLUSH_SEND,
				123	};
				124	EXPORT_SYMBOL(ib_rvt_state_ops);
				125
				126	/* platform specific: return the last level cache (llc) size, in KiB */
				127	static int rvt_wss_llc_size(void)
				128	{
				129	/* assume that the boot CPU value is universal for all CPUs */
				130	return boot_cpu_data.x86_cache_size;
				131	}
				132
				133	/* platform specific: cacheless copy */
				134	static void cacheless_memcpy(void dst, void src, size_t n)
				135	{
				136	/*
				137	* Use the only available X64 cacheless copy. Add a __user cast
				138	* to quiet sparse. The src agument is already in the kernel so
				139	* there are no security issues. The extra fault recovery machinery
				140	* is not invoked.
				141	*/
				142	__copy_user_nocache(dst, (void __user *)src, n, 0);
				143	}
				144
				145	void rvt_wss_exit(struct rvt_dev_info *rdi)
				146	{
				147	struct rvt_wss *wss = rdi->wss;
				148
				149	if (!wss)
				150	return;
				151
				152	/* coded to handle partially initialized and repeat callers */
				153	kfree(wss->entries);
				154	wss->entries = NULL;
				155	kfree(rdi->wss);
				156	rdi->wss = NULL;
				157	}
				158
				159	/**
				160	* rvt_wss_init - Init wss data structures
				161	*
				162	* Return: 0 on success
				163	*/
				164	int rvt_wss_init(struct rvt_dev_info *rdi)
				165	{
				166	unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
				167	unsigned int wss_threshold = rdi->dparms.wss_threshold;
				168	unsigned int wss_clean_period = rdi->dparms.wss_clean_period;
				169	long llc_size;
				170	long llc_bits;
				171	long table_size;
				172	long table_bits;
				173	struct rvt_wss *wss;
				174	int node = rdi->dparms.node;
				175
				176	if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) {
				177	rdi->wss = NULL;
				178	return 0;
				179	}
				180
				181	rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node);
				182	if (!rdi->wss)
				183	return -ENOMEM;
				184	wss = rdi->wss;
				185
				186	/* check for a valid percent range - default to 80 if none or invalid */
				187	if (wss_threshold < 1 \|\| wss_threshold > 100)
				188	wss_threshold = 80;
				189
				190	/* reject a wildly large period */
				191	if (wss_clean_period > 1000000)
				192	wss_clean_period = 256;
				193
				194	/* reject a zero period */
				195	if (wss_clean_period == 0)
				196	wss_clean_period = 1;
				197
				198	/*
				199	* Calculate the table size - the next power of 2 larger than the
				200	* LLC size. LLC size is in KiB.
				201	*/
				202	llc_size = rvt_wss_llc_size() * 1024;
				203	table_size = roundup_pow_of_two(llc_size);
				204
				205	/* one bit per page in rounded up table */
				206	llc_bits = llc_size / PAGE_SIZE;
				207	table_bits = table_size / PAGE_SIZE;
				208	wss->pages_mask = table_bits - 1;
				209	wss->num_entries = table_bits / BITS_PER_LONG;
				210
				211	wss->threshold = (llc_bits * wss_threshold) / 100;
				212	if (wss->threshold == 0)
				213	wss->threshold = 1;
				214
				215	wss->clean_period = wss_clean_period;
				216	atomic_set(&wss->clean_counter, wss_clean_period);
				217
				218	wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries),
				219	GFP_KERNEL, node);
				220	if (!wss->entries) {
				221	rvt_wss_exit(rdi);
				222	return -ENOMEM;
				223	}
				224
				225	return 0;
				226	}
				227
				228	/*
				229	* Advance the clean counter. When the clean period has expired,
				230	* clean an entry.
				231	*
				232	* This is implemented in atomics to avoid locking. Because multiple
				233	* variables are involved, it can be racy which can lead to slightly
				234	* inaccurate information. Since this is only a heuristic, this is
				235	* OK. Any innaccuracies will clean themselves out as the counter
				236	* advances. That said, it is unlikely the entry clean operation will
				237	* race - the next possible racer will not start until the next clean
				238	* period.
				239	*
				240	* The clean counter is implemented as a decrement to zero. When zero
				241	* is reached an entry is cleaned.
				242	*/
				243	static void wss_advance_clean_counter(struct rvt_wss *wss)
				244	{
				245	int entry;
				246	int weight;
				247	unsigned long bits;
				248
				249	/* become the cleaner if we decrement the counter to zero */
				250	if (atomic_dec_and_test(&wss->clean_counter)) {
				251	/*
				252	* Set, not add, the clean period. This avoids an issue
				253	* where the counter could decrement below the clean period.
				254	* Doing a set can result in lost decrements, slowing the
				255	* clean advance. Since this a heuristic, this possible
				256	* slowdown is OK.
				257	*
				258	* An alternative is to loop, advancing the counter by a
				259	* clean period until the result is > 0. However, this could
				260	* lead to several threads keeping another in the clean loop.
				261	* This could be mitigated by limiting the number of times
				262	* we stay in the loop.
				263	*/
				264	atomic_set(&wss->clean_counter, wss->clean_period);
				265
				266	/*
				267	* Uniquely grab the entry to clean and move to next.
				268	* The current entry is always the lower bits of
				269	* wss.clean_entry. The table size, wss.num_entries,
				270	* is always a power-of-2.
				271	*/
				272	entry = (atomic_inc_return(&wss->clean_entry) - 1)
				273	& (wss->num_entries - 1);
				274
				275	/* clear the entry and count the bits */
				276	bits = xchg(&wss->entries[entry], 0);
				277	weight = hweight64((u64)bits);
				278	/* only adjust the contended total count if needed */
				279	if (weight)
				280	atomic_sub(weight, &wss->total_count);
				281	}
				282	}
				283
				284	/*
				285	* Insert the given address into the working set array.
				286	*/
				287	static void wss_insert(struct rvt_wss wss, void address)
				288	{
				289	u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask;
				290	u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
				291	u32 nr = page & (BITS_PER_LONG - 1);
				292
				293	if (!test_and_set_bit(nr, &wss->entries[entry]))
				294	atomic_inc(&wss->total_count);
				295
				296	wss_advance_clean_counter(wss);
				297	}
				298
				299	/*
				300	* Is the working set larger than the threshold?
				301	*/
				302	static inline bool wss_exceeds_threshold(struct rvt_wss *wss)
				303	{
				304	return atomic_read(&wss->total_count) >= wss->threshold;
				305	}
				306
				307	static void get_map_page(struct rvt_qpn_table *qpt,
				308	struct rvt_qpn_map *map)
				309	{
				310	unsigned long page = get_zeroed_page(GFP_KERNEL);
				311
				312	/*
				313	* Free the page if someone raced with us installing it.
				314	*/
				315
				316	spin_lock(&qpt->lock);
				317	if (map->page)
				318	free_page(page);
				319	else
				320	map->page = (void *)page;
				321	spin_unlock(&qpt->lock);
				322	}
				323
				324	/**
				325	* init_qpn_table - initialize the QP number table for a device
				326	* @qpt: the QPN table
				327	*/
				328	static int init_qpn_table(struct rvt_dev_info rdi, struct rvt_qpn_table qpt)
				329	{
				330	u32 offset, i;
				331	struct rvt_qpn_map *map;
				332	int ret = 0;
				333
				334	if (!(rdi->dparms.qpn_res_end >= rdi->dparms.qpn_res_start))
				335	return -EINVAL;
				336
				337	spin_lock_init(&qpt->lock);
				338
				339	qpt->last = rdi->dparms.qpn_start;
				340	qpt->incr = rdi->dparms.qpn_inc << rdi->dparms.qos_shift;
				341
				342	/*
				343	* Drivers may want some QPs beyond what we need for verbs let them use
				344	* our qpn table. No need for two. Lets go ahead and mark the bitmaps
				345	* for those. The reserved range must be after the range which verbs
				346	* will pick from.
				347	*/
				348
				349	/* Figure out number of bit maps needed before reserved range */
				350	qpt->nmaps = rdi->dparms.qpn_res_start / RVT_BITS_PER_PAGE;
				351
				352	/* This should always be zero */
				353	offset = rdi->dparms.qpn_res_start & RVT_BITS_PER_PAGE_MASK;
				354
				355	/* Starting with the first reserved bit map */
				356	map = &qpt->map[qpt->nmaps];
				357
				358	rvt_pr_info(rdi, "Reserving QPNs from 0x%x to 0x%x for non-verbs use\n",
				359	rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end);
				360	for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) {
				361	if (!map->page) {
				362	get_map_page(qpt, map);
				363	if (!map->page) {
				364	ret = -ENOMEM;
				365	break;
				366	}
				367	}
				368	set_bit(offset, map->page);
				369	offset++;
				370	if (offset == RVT_BITS_PER_PAGE) {
				371	/* next page */
				372	qpt->nmaps++;
				373	map++;
				374	offset = 0;
				375	}
				376	}
				377	return ret;
				378	}
				379
				380	/**
				381	* free_qpn_table - free the QP number table for a device
				382	* @qpt: the QPN table
				383	*/
				384	static void free_qpn_table(struct rvt_qpn_table *qpt)
				385	{
				386	int i;
				387
				388	for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
				389	free_page((unsigned long)qpt->map[i].page);
				390	}
				391
				392	/**
				393	* rvt_driver_qp_init - Init driver qp resources
				394	* @rdi: rvt dev strucutre
				395	*
				396	* Return: 0 on success
				397	*/
				398	int rvt_driver_qp_init(struct rvt_dev_info *rdi)
				399	{
				400	int i;
				401	int ret = -ENOMEM;
				402
				403	if (!rdi->dparms.qp_table_size)
				404	return -EINVAL;
				405
				406	/*
				407	* If driver is not doing any QP allocation then make sure it is
				408	* providing the necessary QP functions.
				409	*/
				410	if (!rdi->driver_f.free_all_qps \|\|
				411	!rdi->driver_f.qp_priv_alloc \|\|
				412	!rdi->driver_f.qp_priv_free \|\|
				413	!rdi->driver_f.notify_qp_reset \|\|
				414	!rdi->driver_f.notify_restart_rc)
				415	return -EINVAL;
				416
				417	/* allocate parent object */
				418	rdi->qp_dev = kzalloc_node(sizeof(*rdi->qp_dev), GFP_KERNEL,
				419	rdi->dparms.node);
				420	if (!rdi->qp_dev)
				421	return -ENOMEM;
				422
				423	/* allocate hash table */
				424	rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size;
				425	rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size);
				426	rdi->qp_dev->qp_table =
				427	kmalloc_array_node(rdi->qp_dev->qp_table_size,
				428	sizeof(*rdi->qp_dev->qp_table),
				429	GFP_KERNEL, rdi->dparms.node);
				430	if (!rdi->qp_dev->qp_table)
				431	goto no_qp_table;
				432
				433	for (i = 0; i < rdi->qp_dev->qp_table_size; i++)
				434	RCU_INIT_POINTER(rdi->qp_dev->qp_table[i], NULL);
				435
				436	spin_lock_init(&rdi->qp_dev->qpt_lock);
				437
				438	/* initialize qpn map */
				439	if (init_qpn_table(rdi, &rdi->qp_dev->qpn_table))
				440	goto fail_table;
				441
				442	spin_lock_init(&rdi->n_qps_lock);
				443
				444	return 0;
				445
				446	fail_table:
				447	kfree(rdi->qp_dev->qp_table);
				448	free_qpn_table(&rdi->qp_dev->qpn_table);
				449
				450	no_qp_table:
				451	kfree(rdi->qp_dev);
				452
				453	return ret;
				454	}
				455
				456	/**
				457	* rvt_free_qp_cb - callback function to reset a qp
				458	* @qp: the qp to reset
				459	* @v: a 64-bit value
				460	*
				461	* This function resets the qp and removes it from the
				462	* qp hash table.
				463	*/
				464	static void rvt_free_qp_cb(struct rvt_qp *qp, u64 v)
				465	{
				466	unsigned int qp_inuse = (unsigned int )v;
				467	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
				468
				469	/* Reset the qp and remove it from the qp hash list */
				470	rvt_reset_qp(rdi, qp, qp->ibqp.qp_type);
				471
				472	/* Increment the qp_inuse count */
				473	(*qp_inuse)++;
				474	}
				475
				476	/**
				477	* rvt_free_all_qps - check for QPs still in use
				478	* @rdi: rvt device info structure
				479	*
				480	* There should not be any QPs still in use.
				481	* Free memory for table.
				482	* Return the number of QPs still in use.
				483	*/
				484	static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi)
				485	{
				486	unsigned int qp_inuse = 0;
				487
				488	qp_inuse += rvt_mcast_tree_empty(rdi);
				489
				490	rvt_qp_iter(rdi, (u64)&qp_inuse, rvt_free_qp_cb);
				491
				492	return qp_inuse;
				493	}
				494
				495	/**
				496	* rvt_qp_exit - clean up qps on device exit
				497	* @rdi: rvt dev structure
				498	*
				499	* Check for qp leaks and free resources.
				500	*/
				501	void rvt_qp_exit(struct rvt_dev_info *rdi)
				502	{
				503	u32 qps_inuse = rvt_free_all_qps(rdi);
				504
				505	if (qps_inuse)
				506	rvt_pr_err(rdi, "QP memory leak! %u still in use\n",
				507	qps_inuse);
				508
				509	kfree(rdi->qp_dev->qp_table);
				510	free_qpn_table(&rdi->qp_dev->qpn_table);
				511	kfree(rdi->qp_dev);
				512	}
				513
				514	static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
				515	struct rvt_qpn_map *map, unsigned off)
				516	{
				517	return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
				518	}
				519
				520	/**
				521	* alloc_qpn - Allocate the next available qpn or zero/one for QP type
				522	* IB_QPT_SMI/IB_QPT_GSI
				523	* @rdi: rvt device info structure
				524	* @qpt: queue pair number table pointer
				525	* @port_num: IB port number, 1 based, comes from core
				526	*
				527	* Return: The queue pair number
				528	*/
				529	static int alloc_qpn(struct rvt_dev_info rdi, struct rvt_qpn_table qpt,
				530	enum ib_qp_type type, u8 port_num)
				531	{
				532	u32 i, offset, max_scan, qpn;
				533	struct rvt_qpn_map *map;
				534	u32 ret;
				535
				536	if (rdi->driver_f.alloc_qpn)
				537	return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num);
				538
				539	if (type == IB_QPT_SMI \|\| type == IB_QPT_GSI) {
				540	unsigned n;
				541
				542	ret = type == IB_QPT_GSI;
				543	n = 1 << (ret + 2 * (port_num - 1));
				544	spin_lock(&qpt->lock);
				545	if (qpt->flags & n)
				546	ret = -EINVAL;
				547	else
				548	qpt->flags \|= n;
				549	spin_unlock(&qpt->lock);
				550	goto bail;
				551	}
				552
				553	qpn = qpt->last + qpt->incr;
				554	if (qpn >= RVT_QPN_MAX)
				555	qpn = qpt->incr \| ((qpt->last & 1) ^ 1);
				556	/* offset carries bit 0 */
				557	offset = qpn & RVT_BITS_PER_PAGE_MASK;
				558	map = &qpt->map[qpn / RVT_BITS_PER_PAGE];
				559	max_scan = qpt->nmaps - !offset;
				560	for (i = 0;;) {
				561	if (unlikely(!map->page)) {
				562	get_map_page(qpt, map);
				563	if (unlikely(!map->page))
				564	break;
				565	}
				566	do {
				567	if (!test_and_set_bit(offset, map->page)) {
				568	qpt->last = qpn;
				569	ret = qpn;
				570	goto bail;
				571	}
				572	offset += qpt->incr;
				573	/*
				574	* This qpn might be bogus if offset >= BITS_PER_PAGE.
				575	* That is OK. It gets re-assigned below
				576	*/
				577	qpn = mk_qpn(qpt, map, offset);
				578	} while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX);
				579	/*
				580	* In order to keep the number of pages allocated to a
				581	* minimum, we scan the all existing pages before increasing
				582	* the size of the bitmap table.
				583	*/
				584	if (++i > max_scan) {
				585	if (qpt->nmaps == RVT_QPNMAP_ENTRIES)
				586	break;
				587	map = &qpt->map[qpt->nmaps++];
				588	/* start at incr with current bit 0 */
				589	offset = qpt->incr \| (offset & 1);
				590	} else if (map < &qpt->map[qpt->nmaps]) {
				591	++map;
				592	/* start at incr with current bit 0 */
				593	offset = qpt->incr \| (offset & 1);
				594	} else {
				595	map = &qpt->map[0];
				596	/* wrap to first map page, invert bit 0 */
				597	offset = qpt->incr \| ((offset & 1) ^ 1);
				598	}
				599	/* there can be no set bits in low-order QoS bits */
				600	WARN_ON(rdi->dparms.qos_shift > 1 &&
				601	offset & ((BIT(rdi->dparms.qos_shift - 1) - 1) << 1));
				602	qpn = mk_qpn(qpt, map, offset);
				603	}
				604
				605	ret = -ENOMEM;
				606
				607	bail:
				608	return ret;
				609	}
				610
				611	/**
				612	* rvt_clear_mr_refs - Drop help mr refs
				613	* @qp: rvt qp data structure
				614	* @clr_sends: If shoudl clear send side or not
				615	*/
				616	static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
				617	{
				618	unsigned n;
				619	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
				620
				621	if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
				622	rvt_put_ss(&qp->s_rdma_read_sge);
				623
				624	rvt_put_ss(&qp->r_sge);
				625
				626	if (clr_sends) {
				627	while (qp->s_last != qp->s_head) {
				628	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);
				629
				630	rvt_put_qp_swqe(qp, wqe);
				631	if (++qp->s_last >= qp->s_size)
				632	qp->s_last = 0;
				633	smp_wmb(); /* see qp_set_savail */
				634	}
				635	if (qp->s_rdma_mr) {
				636	rvt_put_mr(qp->s_rdma_mr);
				637	qp->s_rdma_mr = NULL;
				638	}
				639	}
				640
				641	for (n = 0; qp->s_ack_queue && n < rvt_max_atomic(rdi); n++) {
				642	struct rvt_ack_entry *e = &qp->s_ack_queue[n];
				643
				644	if (e->rdma_sge.mr) {
				645	rvt_put_mr(e->rdma_sge.mr);
				646	e->rdma_sge.mr = NULL;
				647	}
				648	}
				649	}
				650
				651	/**
				652	* rvt_swqe_has_lkey - return true if lkey is used by swqe
				653	* @wqe - the send wqe
				654	* @lkey - the lkey
				655	*
				656	* Test the swqe for using lkey
				657	*/
				658	static bool rvt_swqe_has_lkey(struct rvt_swqe *wqe, u32 lkey)
				659	{
				660	int i;
				661
				662	for (i = 0; i < wqe->wr.num_sge; i++) {
				663	struct rvt_sge *sge = &wqe->sg_list[i];
				664
				665	if (rvt_mr_has_lkey(sge->mr, lkey))
				666	return true;
				667	}
				668	return false;
				669	}
				670
				671	/**
				672	* rvt_qp_sends_has_lkey - return true is qp sends use lkey
				673	* @qp - the rvt_qp
				674	* @lkey - the lkey
				675	*/
				676	static bool rvt_qp_sends_has_lkey(struct rvt_qp *qp, u32 lkey)
				677	{
				678	u32 s_last = qp->s_last;
				679
				680	while (s_last != qp->s_head) {
				681	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, s_last);
				682
				683	if (rvt_swqe_has_lkey(wqe, lkey))
				684	return true;
				685
				686	if (++s_last >= qp->s_size)
				687	s_last = 0;
				688	}
				689	if (qp->s_rdma_mr)
				690	if (rvt_mr_has_lkey(qp->s_rdma_mr, lkey))
				691	return true;
				692	return false;
				693	}
				694
				695	/**
				696	* rvt_qp_acks_has_lkey - return true if acks have lkey
				697	* @qp - the qp
				698	* @lkey - the lkey
				699	*/
				700	static bool rvt_qp_acks_has_lkey(struct rvt_qp *qp, u32 lkey)
				701	{
				702	int i;
				703	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
				704
				705	for (i = 0; qp->s_ack_queue && i < rvt_max_atomic(rdi); i++) {
				706	struct rvt_ack_entry *e = &qp->s_ack_queue[i];
				707
				708	if (rvt_mr_has_lkey(e->rdma_sge.mr, lkey))
				709	return true;
				710	}
				711	return false;
				712	}
				713
				714	/*
				715	* rvt_qp_mr_clean - clean up remote ops for lkey
				716	* @qp - the qp
				717	* @lkey - the lkey that is being de-registered
				718	*
				719	* This routine checks if the lkey is being used by
				720	* the qp.
				721	*
				722	* If so, the qp is put into an error state to elminate
				723	* any references from the qp.
				724	*/
				725	void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey)
				726	{
				727	bool lastwqe = false;
				728
				729	if (qp->ibqp.qp_type == IB_QPT_SMI \|\|
				730	qp->ibqp.qp_type == IB_QPT_GSI)
				731	/* avoid special QPs */
				732	return;
				733	spin_lock_irq(&qp->r_lock);
				734	spin_lock(&qp->s_hlock);
				735	spin_lock(&qp->s_lock);
				736
				737	if (qp->state == IB_QPS_ERR \|\| qp->state == IB_QPS_RESET)
				738	goto check_lwqe;
				739
				740	if (rvt_ss_has_lkey(&qp->r_sge, lkey) \|\|
				741	rvt_qp_sends_has_lkey(qp, lkey) \|\|
				742	rvt_qp_acks_has_lkey(qp, lkey))
				743	lastwqe = rvt_error_qp(qp, IB_WC_LOC_PROT_ERR);
				744	check_lwqe:
				745	spin_unlock(&qp->s_lock);
				746	spin_unlock(&qp->s_hlock);
				747	spin_unlock_irq(&qp->r_lock);
				748	if (lastwqe) {
				749	struct ib_event ev;
				750
				751	ev.device = qp->ibqp.device;
				752	ev.element.qp = &qp->ibqp;
				753	ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
				754	qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
				755	}
				756	}
				757
				758	/**
				759	* rvt_remove_qp - remove qp form table
				760	* @rdi: rvt dev struct
				761	* @qp: qp to remove
				762	*
				763	* Remove the QP from the table so it can't be found asynchronously by
				764	* the receive routine.
				765	*/
				766	static void rvt_remove_qp(struct rvt_dev_info rdi, struct rvt_qp qp)
				767	{
				768	struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
				769	u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
				770	unsigned long flags;
				771	int removed = 1;
				772
				773	spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
				774
				775	if (rcu_dereference_protected(rvp->qp[0],
				776	lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
				777	RCU_INIT_POINTER(rvp->qp[0], NULL);
				778	} else if (rcu_dereference_protected(rvp->qp[1],
				779	lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
				780	RCU_INIT_POINTER(rvp->qp[1], NULL);
				781	} else {
				782	struct rvt_qp *q;
				783	struct rvt_qp __rcu **qpp;
				784
				785	removed = 0;
				786	qpp = &rdi->qp_dev->qp_table[n];
				787	for (; (q = rcu_dereference_protected(*qpp,
				788	lockdep_is_held(&rdi->qp_dev->qpt_lock))) != NULL;
				789	qpp = &q->next) {
				790	if (q == qp) {
				791	RCU_INIT_POINTER(*qpp,
				792	rcu_dereference_protected(qp->next,
				793	lockdep_is_held(&rdi->qp_dev->qpt_lock)));
				794	removed = 1;
				795	trace_rvt_qpremove(qp, n);
				796	break;
				797	}
				798	}
				799	}
				800
				801	spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
				802	if (removed) {
				803	synchronize_rcu();
				804	rvt_put_qp(qp);
				805	}
				806	}
				807
				808	/**
				809	* rvt_alloc_rq - allocate memory for user or kernel buffer
				810	* @rq: receive queue data structure
				811	* @size: number of request queue entries
				812	* @node: The NUMA node
				813	* @udata: True if user data is available or not false
				814	*
				815	* Return: If memory allocation failed, return -ENONEM
				816	* This function is used by both shared receive
				817	* queues and non-shared receive queues to allocate
				818	* memory.
				819	*/
				820	int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
				821	struct ib_udata *udata)
				822	{
				823	if (udata) {
				824	rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size);
				825	if (!rq->wq)
				826	goto bail;
				827	/* need kwq with no buffers */
				828	rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node);
				829	if (!rq->kwq)
				830	goto bail;
				831	rq->kwq->curr_wq = rq->wq->wq;
				832	} else {
				833	/* need kwq with buffers */
				834	rq->kwq =
				835	vzalloc_node(sizeof(struct rvt_krwq) + size, node);
				836	if (!rq->kwq)
				837	goto bail;
				838	rq->kwq->curr_wq = rq->kwq->wq;
				839	}
				840
				841	spin_lock_init(&rq->kwq->p_lock);
				842	spin_lock_init(&rq->kwq->c_lock);
				843	return 0;
				844	bail:
				845	rvt_free_rq(rq);
				846	return -ENOMEM;
				847	}
				848
				849	/**
				850	* rvt_init_qp - initialize the QP state to the reset state
				851	* @qp: the QP to init or reinit
				852	* @type: the QP type
				853	*
				854	* This function is called from both rvt_create_qp() and
				855	* rvt_reset_qp(). The difference is that the reset
				856	* patch the necessary locks to protect against concurent
				857	* access.
				858	*/
				859	static void rvt_init_qp(struct rvt_dev_info rdi, struct rvt_qp qp,
				860	enum ib_qp_type type)
				861	{
				862	qp->remote_qpn = 0;
				863	qp->qkey = 0;
				864	qp->qp_access_flags = 0;
				865	qp->s_flags &= RVT_S_SIGNAL_REQ_WR;
				866	qp->s_hdrwords = 0;
				867	qp->s_wqe = NULL;
				868	qp->s_draining = 0;
				869	qp->s_next_psn = 0;
				870	qp->s_last_psn = 0;
				871	qp->s_sending_psn = 0;
				872	qp->s_sending_hpsn = 0;
				873	qp->s_psn = 0;
				874	qp->r_psn = 0;
				875	qp->r_msn = 0;
				876	if (type == IB_QPT_RC) {
				877	qp->s_state = IB_OPCODE_RC_SEND_LAST;
				878	qp->r_state = IB_OPCODE_RC_SEND_LAST;
				879	} else {
				880	qp->s_state = IB_OPCODE_UC_SEND_LAST;
				881	qp->r_state = IB_OPCODE_UC_SEND_LAST;
				882	}
				883	qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
				884	qp->r_nak_state = 0;
				885	qp->r_aflags = 0;
				886	qp->r_flags = 0;
				887	qp->s_head = 0;
				888	qp->s_tail = 0;
				889	qp->s_cur = 0;
				890	qp->s_acked = 0;
				891	qp->s_last = 0;
				892	qp->s_ssn = 1;
				893	qp->s_lsn = 0;
				894	qp->s_mig_state = IB_MIG_MIGRATED;
				895	qp->r_head_ack_queue = 0;
				896	qp->s_tail_ack_queue = 0;
				897	qp->s_acked_ack_queue = 0;
				898	qp->s_num_rd_atomic = 0;
				899	qp->r_sge.num_sge = 0;
				900	atomic_set(&qp->s_reserved_used, 0);
				901	}
				902
				903	/**
				904	* _rvt_reset_qp - initialize the QP state to the reset state
				905	* @qp: the QP to reset
				906	* @type: the QP type
				907	*
				908	* r_lock, s_hlock, and s_lock are required to be held by the caller
				909	*/
				910	static void _rvt_reset_qp(struct rvt_dev_info rdi, struct rvt_qp qp,
				911	enum ib_qp_type type)
				912	__must_hold(&qp->s_lock)
				913	__must_hold(&qp->s_hlock)
				914	__must_hold(&qp->r_lock)
				915	{
				916	lockdep_assert_held(&qp->r_lock);
				917	lockdep_assert_held(&qp->s_hlock);
				918	lockdep_assert_held(&qp->s_lock);
				919	if (qp->state != IB_QPS_RESET) {
				920	qp->state = IB_QPS_RESET;
				921
				922	/* Let drivers flush their waitlist */
				923	rdi->driver_f.flush_qp_waiters(qp);
				924	rvt_stop_rc_timers(qp);
				925	qp->s_flags &= ~(RVT_S_TIMER \| RVT_S_ANY_WAIT);
				926	spin_unlock(&qp->s_lock);
				927	spin_unlock(&qp->s_hlock);
				928	spin_unlock_irq(&qp->r_lock);
				929
				930	/* Stop the send queue and the retry timer */
				931	rdi->driver_f.stop_send_queue(qp);
				932	rvt_del_timers_sync(qp);
				933	/* Wait for things to stop */
				934	rdi->driver_f.quiesce_qp(qp);
				935
				936	/* take qp out the hash and wait for it to be unused */
				937	rvt_remove_qp(rdi, qp);
				938
				939	/* grab the lock b/c it was locked at call time */
				940	spin_lock_irq(&qp->r_lock);
				941	spin_lock(&qp->s_hlock);
				942	spin_lock(&qp->s_lock);
				943
				944	rvt_clear_mr_refs(qp, 1);
				945	/*
				946	* Let the driver do any tear down or re-init it needs to for
				947	* a qp that has been reset
				948	*/
				949	rdi->driver_f.notify_qp_reset(qp);
				950	}
				951	rvt_init_qp(rdi, qp, type);
				952	lockdep_assert_held(&qp->r_lock);
				953	lockdep_assert_held(&qp->s_hlock);
				954	lockdep_assert_held(&qp->s_lock);
				955	}
				956
				957	/**
				958	* rvt_reset_qp - initialize the QP state to the reset state
				959	* @rdi: the device info
				960	* @qp: the QP to reset
				961	* @type: the QP type
				962	*
				963	* This is the wrapper function to acquire the r_lock, s_hlock, and s_lock
				964	* before calling _rvt_reset_qp().
				965	*/
				966	static void rvt_reset_qp(struct rvt_dev_info rdi, struct rvt_qp qp,
				967	enum ib_qp_type type)
				968	{
				969	spin_lock_irq(&qp->r_lock);
				970	spin_lock(&qp->s_hlock);
				971	spin_lock(&qp->s_lock);
				972	_rvt_reset_qp(rdi, qp, type);
				973	spin_unlock(&qp->s_lock);
				974	spin_unlock(&qp->s_hlock);
				975	spin_unlock_irq(&qp->r_lock);
				976	}
				977
				978	/** rvt_free_qpn - Free a qpn from the bit map
				979	* @qpt: QP table
				980	* @qpn: queue pair number to free
				981	*/
				982	static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
				983	{
				984	struct rvt_qpn_map *map;
				985
				986	map = qpt->map + (qpn & RVT_QPN_MASK) / RVT_BITS_PER_PAGE;
				987	if (map->page)
				988	clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
				989	}
				990
				991	/**
				992	* get_allowed_ops - Given a QP type return the appropriate allowed OP
				993	* @type: valid, supported, QP type
				994	*/
				995	static u8 get_allowed_ops(enum ib_qp_type type)
				996	{
				997	return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ?
				998	IB_OPCODE_UC : IB_OPCODE_UD;
				999	}
				1000
				1001	/**
				1002	* free_ud_wq_attr - Clean up AH attribute cache for UD QPs
				1003	* @qp: Valid QP with allowed_ops set
				1004	*
				1005	* The rvt_swqe data structure being used is a union, so this is
				1006	* only valid for UD QPs.
				1007	*/
				1008	static void free_ud_wq_attr(struct rvt_qp *qp)
				1009	{
				1010	struct rvt_swqe *wqe;
				1011	int i;
				1012
				1013	for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
				1014	wqe = rvt_get_swqe_ptr(qp, i);
				1015	kfree(wqe->ud_wr.attr);
				1016	wqe->ud_wr.attr = NULL;
				1017	}
				1018	}
				1019
				1020	/**
				1021	* alloc_ud_wq_attr - AH attribute cache for UD QPs
				1022	* @qp: Valid QP with allowed_ops set
				1023	* @node: Numa node for allocation
				1024	*
				1025	* The rvt_swqe data structure being used is a union, so this is
				1026	* only valid for UD QPs.
				1027	*/
				1028	static int alloc_ud_wq_attr(struct rvt_qp *qp, int node)
				1029	{
				1030	struct rvt_swqe *wqe;
				1031	int i;
				1032
				1033	for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
				1034	wqe = rvt_get_swqe_ptr(qp, i);
				1035	wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr),
				1036	GFP_KERNEL, node);
				1037	if (!wqe->ud_wr.attr) {
				1038	free_ud_wq_attr(qp);
				1039	return -ENOMEM;
				1040	}
				1041	}
				1042
				1043	return 0;
				1044	}
				1045
				1046	/**
				1047	* rvt_create_qp - create a queue pair for a device
				1048	* @ibpd: the protection domain who's device we create the queue pair for
				1049	* @init_attr: the attributes of the queue pair
				1050	* @udata: user data for libibverbs.so
				1051	*
				1052	* Queue pair creation is mostly an rvt issue. However, drivers have their own
				1053	* unique idea of what queue pair numbers mean. For instance there is a reserved
				1054	* range for PSM.
				1055	*
				1056	* Return: the queue pair on success, otherwise returns an errno.
				1057	*
				1058	* Called by the ib_create_qp() core verbs function.
				1059	*/
				1060	struct ib_qp rvt_create_qp(struct ib_pd ibpd,
				1061	struct ib_qp_init_attr *init_attr,
				1062	struct ib_udata *udata)
				1063	{
				1064	struct rvt_qp *qp;
				1065	int err;
				1066	struct rvt_swqe *swq = NULL;
				1067	size_t sz;
				1068	size_t sg_list_sz;
				1069	struct ib_qp *ret = ERR_PTR(-ENOMEM);
				1070	struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
				1071	void *priv = NULL;
				1072	size_t sqsize;
				1073
				1074	if (!rdi)
				1075	return ERR_PTR(-EINVAL);
				1076
				1077	if (init_attr->cap.max_send_sge > rdi->dparms.props.max_send_sge \|\|
				1078	init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr \|\|
				1079	init_attr->create_flags)
				1080	return ERR_PTR(-EINVAL);
				1081
				1082	/* Check receive queue parameters if no SRQ is specified. */
				1083	if (!init_attr->srq) {
				1084	if (init_attr->cap.max_recv_sge >
				1085	rdi->dparms.props.max_recv_sge \|\|
				1086	init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr)
				1087	return ERR_PTR(-EINVAL);
				1088
				1089	if (init_attr->cap.max_send_sge +
				1090	init_attr->cap.max_send_wr +
				1091	init_attr->cap.max_recv_sge +
				1092	init_attr->cap.max_recv_wr == 0)
				1093	return ERR_PTR(-EINVAL);
				1094	}
				1095	sqsize =
				1096	init_attr->cap.max_send_wr + 1 +
				1097	rdi->dparms.reserved_operations;
				1098	switch (init_attr->qp_type) {
				1099	case IB_QPT_SMI:
				1100	case IB_QPT_GSI:
				1101	if (init_attr->port_num == 0 \|\|
				1102	init_attr->port_num > ibpd->device->phys_port_cnt)
				1103	return ERR_PTR(-EINVAL);
				1104	/* fall through */
				1105	case IB_QPT_UC:
				1106	case IB_QPT_RC:
				1107	case IB_QPT_UD:
				1108	sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge);
				1109	swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node);
				1110	if (!swq)
				1111	return ERR_PTR(-ENOMEM);
				1112
				1113	sz = sizeof(*qp);
				1114	sg_list_sz = 0;
				1115	if (init_attr->srq) {
				1116	struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);
				1117
				1118	if (srq->rq.max_sge > 1)
				1119	sg_list_sz = sizeof(qp->r_sg_list)
				1120	(srq->rq.max_sge - 1);
				1121	} else if (init_attr->cap.max_recv_sge > 1)
				1122	sg_list_sz = sizeof(qp->r_sg_list)
				1123	(init_attr->cap.max_recv_sge - 1);
				1124	qp = kzalloc_node(sz + sg_list_sz, GFP_KERNEL,
				1125	rdi->dparms.node);
				1126	if (!qp)
				1127	goto bail_swq;
				1128	qp->allowed_ops = get_allowed_ops(init_attr->qp_type);
				1129
				1130	RCU_INIT_POINTER(qp->next, NULL);
				1131	if (init_attr->qp_type == IB_QPT_RC) {
				1132	qp->s_ack_queue =
				1133	kcalloc_node(rvt_max_atomic(rdi),
				1134	sizeof(*qp->s_ack_queue),
				1135	GFP_KERNEL,
				1136	rdi->dparms.node);
				1137	if (!qp->s_ack_queue)
				1138	goto bail_qp;
				1139	}
				1140	/* initialize timers needed for rc qp */
				1141	timer_setup(&qp->s_timer, rvt_rc_timeout, 0);
				1142	hrtimer_init(&qp->s_rnr_timer, CLOCK_MONOTONIC,
				1143	HRTIMER_MODE_REL);
				1144	qp->s_rnr_timer.function = rvt_rc_rnr_retry;
				1145
				1146	/*
				1147	* Driver needs to set up it's private QP structure and do any
				1148	* initialization that is needed.
				1149	*/
				1150	priv = rdi->driver_f.qp_priv_alloc(rdi, qp);
				1151	if (IS_ERR(priv)) {
				1152	ret = priv;
				1153	goto bail_qp;
				1154	}
				1155	qp->priv = priv;
				1156	qp->timeout_jiffies =
				1157	usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
				1158	1000UL);
				1159	if (init_attr->srq) {
				1160	sz = 0;
				1161	} else {
				1162	qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
				1163	qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
				1164	sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
				1165	sizeof(struct rvt_rwqe);
				1166	err = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz,
				1167	rdi->dparms.node, udata);
				1168	if (err) {
				1169	ret = ERR_PTR(err);
				1170	goto bail_driver_priv;
				1171	}
				1172	}
				1173
				1174	/*
				1175	* ib_create_qp() will initialize qp->ibqp
				1176	* except for qp->ibqp.qp_num.
				1177	*/
				1178	spin_lock_init(&qp->r_lock);
				1179	spin_lock_init(&qp->s_hlock);
				1180	spin_lock_init(&qp->s_lock);
				1181	atomic_set(&qp->refcount, 0);
				1182	atomic_set(&qp->local_ops_pending, 0);
				1183	init_waitqueue_head(&qp->wait);
				1184	INIT_LIST_HEAD(&qp->rspwait);
				1185	qp->state = IB_QPS_RESET;
				1186	qp->s_wq = swq;
				1187	qp->s_size = sqsize;
				1188	qp->s_avail = init_attr->cap.max_send_wr;
				1189	qp->s_max_sge = init_attr->cap.max_send_sge;
				1190	if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
				1191	qp->s_flags = RVT_S_SIGNAL_REQ_WR;
				1192	err = alloc_ud_wq_attr(qp, rdi->dparms.node);
				1193	if (err) {
				1194	ret = (ERR_PTR(err));
				1195	goto bail_rq_rvt;
				1196	}
				1197
				1198	err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
				1199	init_attr->qp_type,
				1200	init_attr->port_num);
				1201	if (err < 0) {
				1202	ret = ERR_PTR(err);
				1203	goto bail_rq_wq;
				1204	}
				1205	qp->ibqp.qp_num = err;
				1206	qp->port_num = init_attr->port_num;
				1207	rvt_init_qp(rdi, qp, init_attr->qp_type);
				1208	if (rdi->driver_f.qp_priv_init) {
				1209	err = rdi->driver_f.qp_priv_init(rdi, qp, init_attr);
				1210	if (err) {
				1211	ret = ERR_PTR(err);
				1212	goto bail_rq_wq;
				1213	}
				1214	}
				1215	break;
				1216
				1217	default:
				1218	/* Don't support raw QPs */
				1219	return ERR_PTR(-EINVAL);
				1220	}
				1221
				1222	init_attr->cap.max_inline_data = 0;
				1223
				1224	/*
				1225	* Return the address of the RWQ as the offset to mmap.
				1226	* See rvt_mmap() for details.
				1227	*/
				1228	if (udata && udata->outlen >= sizeof(__u64)) {
				1229	if (!qp->r_rq.wq) {
				1230	__u64 offset = 0;
				1231
				1232	err = ib_copy_to_udata(udata, &offset,
				1233	sizeof(offset));
				1234	if (err) {
				1235	ret = ERR_PTR(err);
				1236	goto bail_qpn;
				1237	}
				1238	} else {
				1239	u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
				1240
				1241	qp->ip = rvt_create_mmap_info(rdi, s, udata,
				1242	qp->r_rq.wq);
				1243	if (IS_ERR(qp->ip)) {
				1244	ret = ERR_CAST(qp->ip);
				1245	goto bail_qpn;
				1246	}
				1247
				1248	err = ib_copy_to_udata(udata, &qp->ip->offset,
				1249	sizeof(qp->ip->offset));
				1250	if (err) {
				1251	ret = ERR_PTR(err);
				1252	goto bail_ip;
				1253	}
				1254	}
				1255	qp->pid = current->pid;
				1256	}
				1257
				1258	spin_lock(&rdi->n_qps_lock);
				1259	if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) {
				1260	spin_unlock(&rdi->n_qps_lock);
				1261	ret = ERR_PTR(-ENOMEM);
				1262	goto bail_ip;
				1263	}
				1264
				1265	rdi->n_qps_allocated++;
				1266	/*
				1267	* Maintain a busy_jiffies variable that will be added to the timeout
				1268	* period in mod_retry_timer and add_retry_timer. This busy jiffies
				1269	* is scaled by the number of rc qps created for the device to reduce
				1270	* the number of timeouts occurring when there is a large number of
				1271	* qps. busy_jiffies is incremented every rc qp scaling interval.
				1272	* The scaling interval is selected based on extensive performance
				1273	* evaluation of targeted workloads.
				1274	*/
				1275	if (init_attr->qp_type == IB_QPT_RC) {
				1276	rdi->n_rc_qps++;
				1277	rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
				1278	}
				1279	spin_unlock(&rdi->n_qps_lock);
				1280
				1281	if (qp->ip) {
				1282	spin_lock_irq(&rdi->pending_lock);
				1283	list_add(&qp->ip->pending_mmaps, &rdi->pending_mmaps);
				1284	spin_unlock_irq(&rdi->pending_lock);
				1285	}
				1286
				1287	ret = &qp->ibqp;
				1288
				1289	return ret;
				1290
				1291	bail_ip:
				1292	if (qp->ip)
				1293	kref_put(&qp->ip->ref, rvt_release_mmap_info);
				1294
				1295	bail_qpn:
				1296	rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
				1297
				1298	bail_rq_wq:
				1299	free_ud_wq_attr(qp);
				1300
				1301	bail_rq_rvt:
				1302	rvt_free_rq(&qp->r_rq);
				1303
				1304	bail_driver_priv:
				1305	rdi->driver_f.qp_priv_free(rdi, qp);
				1306
				1307	bail_qp:
				1308	kfree(qp->s_ack_queue);
				1309	kfree(qp);
				1310
				1311	bail_swq:
				1312	vfree(swq);
				1313
				1314	return ret;
				1315	}
				1316
				1317	/**
				1318	* rvt_error_qp - put a QP into the error state
				1319	* @qp: the QP to put into the error state
				1320	* @err: the receive completion error to signal if a RWQE is active
				1321	*
				1322	* Flushes both send and receive work queues.
				1323	*
				1324	* Return: true if last WQE event should be generated.
				1325	* The QP r_lock and s_lock should be held and interrupts disabled.
				1326	* If we are already in error state, just return.
				1327	*/
				1328	int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
				1329	{
				1330	struct ib_wc wc;
				1331	int ret = 0;
				1332	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
				1333
				1334	lockdep_assert_held(&qp->r_lock);
				1335	lockdep_assert_held(&qp->s_lock);
				1336	if (qp->state == IB_QPS_ERR \|\| qp->state == IB_QPS_RESET)
				1337	goto bail;
				1338
				1339	qp->state = IB_QPS_ERR;
				1340
				1341	if (qp->s_flags & (RVT_S_TIMER \| RVT_S_WAIT_RNR)) {
				1342	qp->s_flags &= ~(RVT_S_TIMER \| RVT_S_WAIT_RNR);
				1343	del_timer(&qp->s_timer);
				1344	}
				1345
				1346	if (qp->s_flags & RVT_S_ANY_WAIT_SEND)
				1347	qp->s_flags &= ~RVT_S_ANY_WAIT_SEND;
				1348
				1349	rdi->driver_f.notify_error_qp(qp);
				1350
				1351	/* Schedule the sending tasklet to drain the send work queue. */
				1352	if (READ_ONCE(qp->s_last) != qp->s_head)
				1353	rdi->driver_f.schedule_send(qp);
				1354
				1355	rvt_clear_mr_refs(qp, 0);
				1356
				1357	memset(&wc, 0, sizeof(wc));
				1358	wc.qp = &qp->ibqp;
				1359	wc.opcode = IB_WC_RECV;
				1360
				1361	if (test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) {
				1362	wc.wr_id = qp->r_wr_id;
				1363	wc.status = err;
				1364	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
				1365	}
				1366	wc.status = IB_WC_WR_FLUSH_ERR;
				1367
				1368	if (qp->r_rq.kwq) {
				1369	u32 head;
				1370	u32 tail;
				1371	struct rvt_rwq *wq = NULL;
				1372	struct rvt_krwq *kwq = NULL;
				1373
				1374	spin_lock(&qp->r_rq.kwq->c_lock);
				1375	/* qp->ip used to validate if there is a user buffer mmaped */
				1376	if (qp->ip) {
				1377	wq = qp->r_rq.wq;
				1378	head = RDMA_READ_UAPI_ATOMIC(wq->head);
				1379	tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
				1380	} else {
				1381	kwq = qp->r_rq.kwq;
				1382	head = kwq->head;
				1383	tail = kwq->tail;
				1384	}
				1385	/* sanity check pointers before trusting them */
				1386	if (head >= qp->r_rq.size)
				1387	head = 0;
				1388	if (tail >= qp->r_rq.size)
				1389	tail = 0;
				1390	while (tail != head) {
				1391	wc.wr_id = rvt_get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
				1392	if (++tail >= qp->r_rq.size)
				1393	tail = 0;
				1394	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
				1395	}
				1396	if (qp->ip)
				1397	RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
				1398	else
				1399	kwq->tail = tail;
				1400	spin_unlock(&qp->r_rq.kwq->c_lock);
				1401	} else if (qp->ibqp.event_handler) {
				1402	ret = 1;
				1403	}
				1404
				1405	bail:
				1406	return ret;
				1407	}
				1408	EXPORT_SYMBOL(rvt_error_qp);
				1409
				1410	/*
				1411	* Put the QP into the hash table.
				1412	* The hash table holds a reference to the QP.
				1413	*/
				1414	static void rvt_insert_qp(struct rvt_dev_info rdi, struct rvt_qp qp)
				1415	{
				1416	struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
				1417	unsigned long flags;
				1418
				1419	rvt_get_qp(qp);
				1420	spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
				1421
				1422	if (qp->ibqp.qp_num <= 1) {
				1423	rcu_assign_pointer(rvp->qp[qp->ibqp.qp_num], qp);
				1424	} else {
				1425	u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
				1426
				1427	qp->next = rdi->qp_dev->qp_table[n];
				1428	rcu_assign_pointer(rdi->qp_dev->qp_table[n], qp);
				1429	trace_rvt_qpinsert(qp, n);
				1430	}
				1431
				1432	spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
				1433	}
				1434
				1435	/**
				1436	* rvt_modify_qp - modify the attributes of a queue pair
				1437	* @ibqp: the queue pair who's attributes we're modifying
				1438	* @attr: the new attributes
				1439	* @attr_mask: the mask of attributes to modify
				1440	* @udata: user data for libibverbs.so
				1441	*
				1442	* Return: 0 on success, otherwise returns an errno.
				1443	*/
				1444	int rvt_modify_qp(struct ib_qp ibqp, struct ib_qp_attr attr,
				1445	int attr_mask, struct ib_udata *udata)
				1446	{
				1447	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
				1448	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
				1449	enum ib_qp_state cur_state, new_state;
				1450	struct ib_event ev;
				1451	int lastwqe = 0;
				1452	int mig = 0;
				1453	int pmtu = 0; /* for gcc warning only */
				1454	int opa_ah;
				1455
				1456	spin_lock_irq(&qp->r_lock);
				1457	spin_lock(&qp->s_hlock);
				1458	spin_lock(&qp->s_lock);
				1459
				1460	cur_state = attr_mask & IB_QP_CUR_STATE ?
				1461	attr->cur_qp_state : qp->state;
				1462	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
				1463	opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num);
				1464
				1465	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
				1466	attr_mask))
				1467	goto inval;
				1468
				1469	if (rdi->driver_f.check_modify_qp &&
				1470	rdi->driver_f.check_modify_qp(qp, attr, attr_mask, udata))
				1471	goto inval;
				1472
				1473	if (attr_mask & IB_QP_AV) {
				1474	if (opa_ah) {
				1475	if (rdma_ah_get_dlid(&attr->ah_attr) >=
				1476	opa_get_mcast_base(OPA_MCAST_NR))
				1477	goto inval;
				1478	} else {
				1479	if (rdma_ah_get_dlid(&attr->ah_attr) >=
				1480	be16_to_cpu(IB_MULTICAST_LID_BASE))
				1481	goto inval;
				1482	}
				1483
				1484	if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr))
				1485	goto inval;
				1486	}
				1487
				1488	if (attr_mask & IB_QP_ALT_PATH) {
				1489	if (opa_ah) {
				1490	if (rdma_ah_get_dlid(&attr->alt_ah_attr) >=
				1491	opa_get_mcast_base(OPA_MCAST_NR))
				1492	goto inval;
				1493	} else {
				1494	if (rdma_ah_get_dlid(&attr->alt_ah_attr) >=
				1495	be16_to_cpu(IB_MULTICAST_LID_BASE))
				1496	goto inval;
				1497	}
				1498
				1499	if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
				1500	goto inval;
				1501	if (attr->alt_pkey_index >= rvt_get_npkeys(rdi))
				1502	goto inval;
				1503	}
				1504
				1505	if (attr_mask & IB_QP_PKEY_INDEX)
				1506	if (attr->pkey_index >= rvt_get_npkeys(rdi))
				1507	goto inval;
				1508
				1509	if (attr_mask & IB_QP_MIN_RNR_TIMER)
				1510	if (attr->min_rnr_timer > 31)
				1511	goto inval;
				1512
				1513	if (attr_mask & IB_QP_PORT)
				1514	if (qp->ibqp.qp_type == IB_QPT_SMI \|\|
				1515	qp->ibqp.qp_type == IB_QPT_GSI \|\|
				1516	attr->port_num == 0 \|\|
				1517	attr->port_num > ibqp->device->phys_port_cnt)
				1518	goto inval;
				1519
				1520	if (attr_mask & IB_QP_DEST_QPN)
				1521	if (attr->dest_qp_num > RVT_QPN_MASK)
				1522	goto inval;
				1523
				1524	if (attr_mask & IB_QP_RETRY_CNT)
				1525	if (attr->retry_cnt > 7)
				1526	goto inval;
				1527
				1528	if (attr_mask & IB_QP_RNR_RETRY)
				1529	if (attr->rnr_retry > 7)
				1530	goto inval;
				1531
				1532	/*
				1533	* Don't allow invalid path_mtu values. OK to set greater
				1534	* than the active mtu (or even the max_cap, if we have tuned
				1535	* that to a small mtu. We'll set qp->path_mtu
				1536	* to the lesser of requested attribute mtu and active,
				1537	* for packetizing messages.
				1538	* Note that the QP port has to be set in INIT and MTU in RTR.
				1539	*/
				1540	if (attr_mask & IB_QP_PATH_MTU) {
				1541	pmtu = rdi->driver_f.get_pmtu_from_attr(rdi, qp, attr);
				1542	if (pmtu < 0)
				1543	goto inval;
				1544	}
				1545
				1546	if (attr_mask & IB_QP_PATH_MIG_STATE) {
				1547	if (attr->path_mig_state == IB_MIG_REARM) {
				1548	if (qp->s_mig_state == IB_MIG_ARMED)
				1549	goto inval;
				1550	if (new_state != IB_QPS_RTS)
				1551	goto inval;
				1552	} else if (attr->path_mig_state == IB_MIG_MIGRATED) {
				1553	if (qp->s_mig_state == IB_MIG_REARM)
				1554	goto inval;
				1555	if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
				1556	goto inval;
				1557	if (qp->s_mig_state == IB_MIG_ARMED)
				1558	mig = 1;
				1559	} else {
				1560	goto inval;
				1561	}
				1562	}
				1563
				1564	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
				1565	if (attr->max_dest_rd_atomic > rdi->dparms.max_rdma_atomic)
				1566	goto inval;
				1567
				1568	switch (new_state) {
				1569	case IB_QPS_RESET:
				1570	if (qp->state != IB_QPS_RESET)
				1571	_rvt_reset_qp(rdi, qp, ibqp->qp_type);
				1572	break;
				1573
				1574	case IB_QPS_RTR:
				1575	/* Allow event to re-trigger if QP set to RTR more than once */
				1576	qp->r_flags &= ~RVT_R_COMM_EST;
				1577	qp->state = new_state;
				1578	break;
				1579
				1580	case IB_QPS_SQD:
				1581	qp->s_draining = qp->s_last != qp->s_cur;
				1582	qp->state = new_state;
				1583	break;
				1584
				1585	case IB_QPS_SQE:
				1586	if (qp->ibqp.qp_type == IB_QPT_RC)
				1587	goto inval;
				1588	qp->state = new_state;
				1589	break;
				1590
				1591	case IB_QPS_ERR:
				1592	lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
				1593	break;
				1594
				1595	default:
				1596	qp->state = new_state;
				1597	break;
				1598	}
				1599
				1600	if (attr_mask & IB_QP_PKEY_INDEX)
				1601	qp->s_pkey_index = attr->pkey_index;
				1602
				1603	if (attr_mask & IB_QP_PORT)
				1604	qp->port_num = attr->port_num;
				1605
				1606	if (attr_mask & IB_QP_DEST_QPN)
				1607	qp->remote_qpn = attr->dest_qp_num;
				1608
				1609	if (attr_mask & IB_QP_SQ_PSN) {
				1610	qp->s_next_psn = attr->sq_psn & rdi->dparms.psn_modify_mask;
				1611	qp->s_psn = qp->s_next_psn;
				1612	qp->s_sending_psn = qp->s_next_psn;
				1613	qp->s_last_psn = qp->s_next_psn - 1;
				1614	qp->s_sending_hpsn = qp->s_last_psn;
				1615	}
				1616
				1617	if (attr_mask & IB_QP_RQ_PSN)
				1618	qp->r_psn = attr->rq_psn & rdi->dparms.psn_modify_mask;
				1619
				1620	if (attr_mask & IB_QP_ACCESS_FLAGS)
				1621	qp->qp_access_flags = attr->qp_access_flags;
				1622
				1623	if (attr_mask & IB_QP_AV) {
				1624	rdma_replace_ah_attr(&qp->remote_ah_attr, &attr->ah_attr);
				1625	qp->s_srate = rdma_ah_get_static_rate(&attr->ah_attr);
				1626	qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
				1627	}
				1628
				1629	if (attr_mask & IB_QP_ALT_PATH) {
				1630	rdma_replace_ah_attr(&qp->alt_ah_attr, &attr->alt_ah_attr);
				1631	qp->s_alt_pkey_index = attr->alt_pkey_index;
				1632	}
				1633
				1634	if (attr_mask & IB_QP_PATH_MIG_STATE) {
				1635	qp->s_mig_state = attr->path_mig_state;
				1636	if (mig) {
				1637	qp->remote_ah_attr = qp->alt_ah_attr;
				1638	qp->port_num = rdma_ah_get_port_num(&qp->alt_ah_attr);
				1639	qp->s_pkey_index = qp->s_alt_pkey_index;
				1640	}
				1641	}
				1642
				1643	if (attr_mask & IB_QP_PATH_MTU) {
				1644	qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu);
				1645	qp->log_pmtu = ilog2(qp->pmtu);
				1646	}
				1647
				1648	if (attr_mask & IB_QP_RETRY_CNT) {
				1649	qp->s_retry_cnt = attr->retry_cnt;
				1650	qp->s_retry = attr->retry_cnt;
				1651	}
				1652
				1653	if (attr_mask & IB_QP_RNR_RETRY) {
				1654	qp->s_rnr_retry_cnt = attr->rnr_retry;
				1655	qp->s_rnr_retry = attr->rnr_retry;
				1656	}
				1657
				1658	if (attr_mask & IB_QP_MIN_RNR_TIMER)
				1659	qp->r_min_rnr_timer = attr->min_rnr_timer;
				1660
				1661	if (attr_mask & IB_QP_TIMEOUT) {
				1662	qp->timeout = attr->timeout;
				1663	qp->timeout_jiffies = rvt_timeout_to_jiffies(qp->timeout);
				1664	}
				1665
				1666	if (attr_mask & IB_QP_QKEY)
				1667	qp->qkey = attr->qkey;
				1668
				1669	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
				1670	qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
				1671
				1672	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
				1673	qp->s_max_rd_atomic = attr->max_rd_atomic;
				1674
				1675	if (rdi->driver_f.modify_qp)
				1676	rdi->driver_f.modify_qp(qp, attr, attr_mask, udata);
				1677
				1678	spin_unlock(&qp->s_lock);
				1679	spin_unlock(&qp->s_hlock);
				1680	spin_unlock_irq(&qp->r_lock);
				1681
				1682	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
				1683	rvt_insert_qp(rdi, qp);
				1684
				1685	if (lastwqe) {
				1686	ev.device = qp->ibqp.device;
				1687	ev.element.qp = &qp->ibqp;
				1688	ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
				1689	qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
				1690	}
				1691	if (mig) {
				1692	ev.device = qp->ibqp.device;
				1693	ev.element.qp = &qp->ibqp;
				1694	ev.event = IB_EVENT_PATH_MIG;
				1695	qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
				1696	}
				1697	return 0;
				1698
				1699	inval:
				1700	spin_unlock(&qp->s_lock);
				1701	spin_unlock(&qp->s_hlock);
				1702	spin_unlock_irq(&qp->r_lock);
				1703	return -EINVAL;
				1704	}
				1705
				1706	/**
				1707	* rvt_destroy_qp - destroy a queue pair
				1708	* @ibqp: the queue pair to destroy
				1709	*
				1710	* Note that this can be called while the QP is actively sending or
				1711	* receiving!
				1712	*
				1713	* Return: 0 on success.
				1714	*/
				1715	int rvt_destroy_qp(struct ib_qp ibqp, struct ib_udata udata)
				1716	{
				1717	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
				1718	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
				1719
				1720	rvt_reset_qp(rdi, qp, ibqp->qp_type);
				1721
				1722	wait_event(qp->wait, !atomic_read(&qp->refcount));
				1723	/* qpn is now available for use again */
				1724	rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
				1725
				1726	spin_lock(&rdi->n_qps_lock);
				1727	rdi->n_qps_allocated--;
				1728	if (qp->ibqp.qp_type == IB_QPT_RC) {
				1729	rdi->n_rc_qps--;
				1730	rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
				1731	}
				1732	spin_unlock(&rdi->n_qps_lock);
				1733
				1734	if (qp->ip)
				1735	kref_put(&qp->ip->ref, rvt_release_mmap_info);
				1736	kvfree(qp->r_rq.kwq);
				1737	rdi->driver_f.qp_priv_free(rdi, qp);
				1738	kfree(qp->s_ack_queue);
				1739	rdma_destroy_ah_attr(&qp->remote_ah_attr);
				1740	rdma_destroy_ah_attr(&qp->alt_ah_attr);
				1741	free_ud_wq_attr(qp);
				1742	vfree(qp->s_wq);
				1743	kfree(qp);
				1744	return 0;
				1745	}
				1746
				1747	/**
				1748	* rvt_query_qp - query an ipbq
				1749	* @ibqp: IB qp to query
				1750	* @attr: attr struct to fill in
				1751	* @attr_mask: attr mask ignored
				1752	* @init_attr: struct to fill in
				1753	*
				1754	* Return: always 0
				1755	*/
				1756	int rvt_query_qp(struct ib_qp ibqp, struct ib_qp_attr attr,
				1757	int attr_mask, struct ib_qp_init_attr *init_attr)
				1758	{
				1759	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
				1760	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
				1761
				1762	attr->qp_state = qp->state;
				1763	attr->cur_qp_state = attr->qp_state;
				1764	attr->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu);
				1765	attr->path_mig_state = qp->s_mig_state;
				1766	attr->qkey = qp->qkey;
				1767	attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask;
				1768	attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask;
				1769	attr->dest_qp_num = qp->remote_qpn;
				1770	attr->qp_access_flags = qp->qp_access_flags;
				1771	attr->cap.max_send_wr = qp->s_size - 1 -
				1772	rdi->dparms.reserved_operations;
				1773	attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
				1774	attr->cap.max_send_sge = qp->s_max_sge;
				1775	attr->cap.max_recv_sge = qp->r_rq.max_sge;
				1776	attr->cap.max_inline_data = 0;
				1777	attr->ah_attr = qp->remote_ah_attr;
				1778	attr->alt_ah_attr = qp->alt_ah_attr;
				1779	attr->pkey_index = qp->s_pkey_index;
				1780	attr->alt_pkey_index = qp->s_alt_pkey_index;
				1781	attr->en_sqd_async_notify = 0;
				1782	attr->sq_draining = qp->s_draining;
				1783	attr->max_rd_atomic = qp->s_max_rd_atomic;
				1784	attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
				1785	attr->min_rnr_timer = qp->r_min_rnr_timer;
				1786	attr->port_num = qp->port_num;
				1787	attr->timeout = qp->timeout;
				1788	attr->retry_cnt = qp->s_retry_cnt;
				1789	attr->rnr_retry = qp->s_rnr_retry_cnt;
				1790	attr->alt_port_num =
				1791	rdma_ah_get_port_num(&qp->alt_ah_attr);
				1792	attr->alt_timeout = qp->alt_timeout;
				1793
				1794	init_attr->event_handler = qp->ibqp.event_handler;
				1795	init_attr->qp_context = qp->ibqp.qp_context;
				1796	init_attr->send_cq = qp->ibqp.send_cq;
				1797	init_attr->recv_cq = qp->ibqp.recv_cq;
				1798	init_attr->srq = qp->ibqp.srq;
				1799	init_attr->cap = attr->cap;
				1800	if (qp->s_flags & RVT_S_SIGNAL_REQ_WR)
				1801	init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
				1802	else
				1803	init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
				1804	init_attr->qp_type = qp->ibqp.qp_type;
				1805	init_attr->port_num = qp->port_num;
				1806	return 0;
				1807	}
				1808
				1809	/**
				1810	* rvt_post_receive - post a receive on a QP
				1811	* @ibqp: the QP to post the receive on
				1812	* @wr: the WR to post
				1813	* @bad_wr: the first bad WR is put here
				1814	*
				1815	* This may be called from interrupt context.
				1816	*
				1817	* Return: 0 on success otherwise errno
				1818	*/
				1819	int rvt_post_recv(struct ib_qp ibqp, const struct ib_recv_wr wr,
				1820	const struct ib_recv_wr **bad_wr)
				1821	{
				1822	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
				1823	struct rvt_krwq *wq = qp->r_rq.kwq;
				1824	unsigned long flags;
				1825	int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
				1826	!qp->ibqp.srq;
				1827
				1828	/* Check that state is OK to post receive. */
				1829	if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK) \|\| !wq) {
				1830	*bad_wr = wr;
				1831	return -EINVAL;
				1832	}
				1833
				1834	for (; wr; wr = wr->next) {
				1835	struct rvt_rwqe *wqe;
				1836	u32 next;
				1837	int i;
				1838
				1839	if ((unsigned)wr->num_sge > qp->r_rq.max_sge) {
				1840	*bad_wr = wr;
				1841	return -EINVAL;
				1842	}
				1843
				1844	spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags);
				1845	next = wq->head + 1;
				1846	if (next >= qp->r_rq.size)
				1847	next = 0;
				1848	if (next == READ_ONCE(wq->tail)) {
				1849	spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
				1850	*bad_wr = wr;
				1851	return -ENOMEM;
				1852	}
				1853	if (unlikely(qp_err_flush)) {
				1854	struct ib_wc wc;
				1855
				1856	memset(&wc, 0, sizeof(wc));
				1857	wc.qp = &qp->ibqp;
				1858	wc.opcode = IB_WC_RECV;
				1859	wc.wr_id = wr->wr_id;
				1860	wc.status = IB_WC_WR_FLUSH_ERR;
				1861	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
				1862	} else {
				1863	wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
				1864	wqe->wr_id = wr->wr_id;
				1865	wqe->num_sge = wr->num_sge;
				1866	for (i = 0; i < wr->num_sge; i++) {
				1867	wqe->sg_list[i].addr = wr->sg_list[i].addr;
				1868	wqe->sg_list[i].length = wr->sg_list[i].length;
				1869	wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
				1870	}
				1871	/*
				1872	* Make sure queue entry is written
				1873	* before the head index.
				1874	*/
				1875	smp_store_release(&wq->head, next);
				1876	}
				1877	spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
				1878	}
				1879	return 0;
				1880	}
				1881
				1882	/**
				1883	* rvt_qp_valid_operation - validate post send wr request
				1884	* @qp - the qp
				1885	* @post-parms - the post send table for the driver
				1886	* @wr - the work request
				1887	*
				1888	* The routine validates the operation based on the
				1889	* validation table an returns the length of the operation
				1890	* which can extend beyond the ib_send_bw. Operation
				1891	* dependent flags key atomic operation validation.
				1892	*
				1893	* There is an exception for UD qps that validates the pd and
				1894	* overrides the length to include the additional UD specific
				1895	* length.
				1896	*
				1897	* Returns a negative error or the length of the work request
				1898	* for building the swqe.
				1899	*/
				1900	static inline int rvt_qp_valid_operation(
				1901	struct rvt_qp *qp,
				1902	const struct rvt_operation_params *post_parms,
				1903	const struct ib_send_wr *wr)
				1904	{
				1905	int len;
				1906
				1907	if (wr->opcode >= RVT_OPERATION_MAX \|\| !post_parms[wr->opcode].length)
				1908	return -EINVAL;
				1909	if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type)))
				1910	return -EINVAL;
				1911	if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) &&
				1912	ibpd_to_rvtpd(qp->ibqp.pd)->user)
				1913	return -EINVAL;
				1914	if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE &&
				1915	(wr->num_sge == 0 \|\|
				1916	wr->sg_list[0].length < sizeof(u64) \|\|
				1917	wr->sg_list[0].addr & (sizeof(u64) - 1)))
				1918	return -EINVAL;
				1919	if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC &&
				1920	!qp->s_max_rd_atomic)
				1921	return -EINVAL;
				1922	len = post_parms[wr->opcode].length;
				1923	/* UD specific */
				1924	if (qp->ibqp.qp_type != IB_QPT_UC &&
				1925	qp->ibqp.qp_type != IB_QPT_RC) {
				1926	if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
				1927	return -EINVAL;
				1928	len = sizeof(struct ib_ud_wr);
				1929	}
				1930	return len;
				1931	}
				1932
				1933	/**
				1934	* rvt_qp_is_avail - determine queue capacity
				1935	* @qp: the qp
				1936	* @rdi: the rdmavt device
				1937	* @reserved_op: is reserved operation
				1938	*
				1939	* This assumes the s_hlock is held but the s_last
				1940	* qp variable is uncontrolled.
				1941	*
				1942	* For non reserved operations, the qp->s_avail
				1943	* may be changed.
				1944	*
				1945	* The return value is zero or a -ENOMEM.
				1946	*/
				1947	static inline int rvt_qp_is_avail(
				1948	struct rvt_qp *qp,
				1949	struct rvt_dev_info *rdi,
				1950	bool reserved_op)
				1951	{
				1952	u32 slast;
				1953	u32 avail;
				1954	u32 reserved_used;
				1955
				1956	/* see rvt_qp_wqe_unreserve() */
				1957	smp_mb__before_atomic();
				1958	if (unlikely(reserved_op)) {
				1959	/* see rvt_qp_wqe_unreserve() */
				1960	reserved_used = atomic_read(&qp->s_reserved_used);
				1961	if (reserved_used >= rdi->dparms.reserved_operations)
				1962	return -ENOMEM;
				1963	return 0;
				1964	}
				1965	/* non-reserved operations */
				1966	if (likely(qp->s_avail))
				1967	return 0;
				1968	/* See rvt_qp_complete_swqe() */
				1969	slast = smp_load_acquire(&qp->s_last);
				1970	if (qp->s_head >= slast)
				1971	avail = qp->s_size - (qp->s_head - slast);
				1972	else
				1973	avail = slast - qp->s_head;
				1974
				1975	reserved_used = atomic_read(&qp->s_reserved_used);
				1976	avail = avail - 1 -
				1977	(rdi->dparms.reserved_operations - reserved_used);
				1978	/* insure we don't assign a negative s_avail */
				1979	if ((s32)avail <= 0)
				1980	return -ENOMEM;
				1981	qp->s_avail = avail;
				1982	if (WARN_ON(qp->s_avail >
				1983	(qp->s_size - 1 - rdi->dparms.reserved_operations)))
				1984	rvt_pr_err(rdi,
				1985	"More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
				1986	qp->ibqp.qp_num, qp->s_size, qp->s_avail,
				1987	qp->s_head, qp->s_tail, qp->s_cur,
				1988	qp->s_acked, qp->s_last);
				1989	return 0;
				1990	}
				1991
				1992	/**
				1993	* rvt_post_one_wr - post one RC, UC, or UD send work request
				1994	* @qp: the QP to post on
				1995	* @wr: the work request to send
				1996	*/
				1997	static int rvt_post_one_wr(struct rvt_qp *qp,
				1998	const struct ib_send_wr *wr,
				1999	bool *call_send)
				2000	{
				2001	struct rvt_swqe *wqe;
				2002	u32 next;
				2003	int i;
				2004	int j;
				2005	int acc;
				2006	struct rvt_lkey_table *rkt;
				2007	struct rvt_pd *pd;
				2008	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
				2009	u8 log_pmtu;
				2010	int ret;
				2011	size_t cplen;
				2012	bool reserved_op;
				2013	int local_ops_delayed = 0;
				2014
				2015	BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE));
				2016
				2017	/* IB spec says that num_sge == 0 is OK. */
				2018	if (unlikely(wr->num_sge > qp->s_max_sge))
				2019	return -EINVAL;
				2020
				2021	ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr);
				2022	if (ret < 0)
				2023	return ret;
				2024	cplen = ret;
				2025
				2026	/*
				2027	* Local operations include fast register and local invalidate.
				2028	* Fast register needs to be processed immediately because the
				2029	* registered lkey may be used by following work requests and the
				2030	* lkey needs to be valid at the time those requests are posted.
				2031	* Local invalidate can be processed immediately if fencing is
				2032	* not required and no previous local invalidate ops are pending.
				2033	* Signaled local operations that have been processed immediately
				2034	* need to have requests with "completion only" flags set posted
				2035	* to the send queue in order to generate completions.
				2036	*/
				2037	if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) {
				2038	switch (wr->opcode) {
				2039	case IB_WR_REG_MR:
				2040	ret = rvt_fast_reg_mr(qp,
				2041	reg_wr(wr)->mr,
				2042	reg_wr(wr)->key,
				2043	reg_wr(wr)->access);
				2044	if (ret \|\| !(wr->send_flags & IB_SEND_SIGNALED))
				2045	return ret;
				2046	break;
				2047	case IB_WR_LOCAL_INV:
				2048	if ((wr->send_flags & IB_SEND_FENCE) \|\|
				2049	atomic_read(&qp->local_ops_pending)) {
				2050	local_ops_delayed = 1;
				2051	} else {
				2052	ret = rvt_invalidate_rkey(
				2053	qp, wr->ex.invalidate_rkey);
				2054	if (ret \|\| !(wr->send_flags & IB_SEND_SIGNALED))
				2055	return ret;
				2056	}
				2057	break;
				2058	default:
				2059	return -EINVAL;
				2060	}
				2061	}
				2062
				2063	reserved_op = rdi->post_parms[wr->opcode].flags &
				2064	RVT_OPERATION_USE_RESERVE;
				2065	/* check for avail */
				2066	ret = rvt_qp_is_avail(qp, rdi, reserved_op);
				2067	if (ret)
				2068	return ret;
				2069	next = qp->s_head + 1;
				2070	if (next >= qp->s_size)
				2071	next = 0;
				2072
				2073	rkt = &rdi->lkey_table;
				2074	pd = ibpd_to_rvtpd(qp->ibqp.pd);
				2075	wqe = rvt_get_swqe_ptr(qp, qp->s_head);
				2076
				2077	/* cplen has length from above */
				2078	memcpy(&wqe->wr, wr, cplen);
				2079
				2080	wqe->length = 0;
				2081	j = 0;
				2082	if (wr->num_sge) {
				2083	struct rvt_sge *last_sge = NULL;
				2084
				2085	acc = wr->opcode >= IB_WR_RDMA_READ ?
				2086	IB_ACCESS_LOCAL_WRITE : 0;
				2087	for (i = 0; i < wr->num_sge; i++) {
				2088	u32 length = wr->sg_list[i].length;
				2089
				2090	if (length == 0)
				2091	continue;
				2092	ret = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], last_sge,
				2093	&wr->sg_list[i], acc);
				2094	if (unlikely(ret < 0))
				2095	goto bail_inval_free;
				2096	wqe->length += length;
				2097	if (ret)
				2098	last_sge = &wqe->sg_list[j];
				2099	j += ret;
				2100	}
				2101	wqe->wr.num_sge = j;
				2102	}
				2103
				2104	/*
				2105	* Calculate and set SWQE PSN values prior to handing it off
				2106	* to the driver's check routine. This give the driver the
				2107	* opportunity to adjust PSN values based on internal checks.
				2108	*/
				2109	log_pmtu = qp->log_pmtu;
				2110	if (qp->allowed_ops == IB_OPCODE_UD) {
				2111	struct rvt_ah *ah = rvt_get_swqe_ah(wqe);
				2112
				2113	log_pmtu = ah->log_pmtu;
				2114	rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr);
				2115	}
				2116
				2117	if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
				2118	if (local_ops_delayed)
				2119	atomic_inc(&qp->local_ops_pending);
				2120	else
				2121	wqe->wr.send_flags \|= RVT_SEND_COMPLETION_ONLY;
				2122	wqe->ssn = 0;
				2123	wqe->psn = 0;
				2124	wqe->lpsn = 0;
				2125	} else {
				2126	wqe->ssn = qp->s_ssn++;
				2127	wqe->psn = qp->s_next_psn;
				2128	wqe->lpsn = wqe->psn +
				2129	(wqe->length ?
				2130	((wqe->length - 1) >> log_pmtu) :
				2131	0);
				2132	}
				2133
				2134	/* general part of wqe valid - allow for driver checks */
				2135	if (rdi->driver_f.setup_wqe) {
				2136	ret = rdi->driver_f.setup_wqe(qp, wqe, call_send);
				2137	if (ret < 0)
				2138	goto bail_inval_free_ref;
				2139	}
				2140
				2141	if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL))
				2142	qp->s_next_psn = wqe->lpsn + 1;
				2143
				2144	if (unlikely(reserved_op)) {
				2145	wqe->wr.send_flags \|= RVT_SEND_RESERVE_USED;
				2146	rvt_qp_wqe_reserve(qp, wqe);
				2147	} else {
				2148	wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED;
				2149	qp->s_avail--;
				2150	}
				2151	trace_rvt_post_one_wr(qp, wqe, wr->num_sge);
				2152	smp_wmb(); /* see request builders */
				2153	qp->s_head = next;
				2154
				2155	return 0;
				2156
				2157	bail_inval_free_ref:
				2158	if (qp->allowed_ops == IB_OPCODE_UD)
				2159	rdma_destroy_ah_attr(wqe->ud_wr.attr);
				2160	bail_inval_free:
				2161	/* release mr holds */
				2162	while (j) {
				2163	struct rvt_sge *sge = &wqe->sg_list[--j];
				2164
				2165	rvt_put_mr(sge->mr);
				2166	}
				2167	return ret;
				2168	}
				2169
				2170	/**
				2171	* rvt_post_send - post a send on a QP
				2172	* @ibqp: the QP to post the send on
				2173	* @wr: the list of work requests to post
				2174	* @bad_wr: the first bad WR is put here
				2175	*
				2176	* This may be called from interrupt context.
				2177	*
				2178	* Return: 0 on success else errno
				2179	*/
				2180	int rvt_post_send(struct ib_qp ibqp, const struct ib_send_wr wr,
				2181	const struct ib_send_wr **bad_wr)
				2182	{
				2183	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
				2184	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
				2185	unsigned long flags = 0;
				2186	bool call_send;
				2187	unsigned nreq = 0;
				2188	int err = 0;
				2189
				2190	spin_lock_irqsave(&qp->s_hlock, flags);
				2191
				2192	/*
				2193	* Ensure QP state is such that we can send. If not bail out early,
				2194	* there is no need to do this every time we post a send.
				2195	*/
				2196	if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) {
				2197	spin_unlock_irqrestore(&qp->s_hlock, flags);
				2198	return -EINVAL;
				2199	}
				2200
				2201	/*
				2202	* If the send queue is empty, and we only have a single WR then just go
				2203	* ahead and kick the send engine into gear. Otherwise we will always
				2204	* just schedule the send to happen later.
				2205	*/
				2206	call_send = qp->s_head == READ_ONCE(qp->s_last) && !wr->next;
				2207
				2208	for (; wr; wr = wr->next) {
				2209	err = rvt_post_one_wr(qp, wr, &call_send);
				2210	if (unlikely(err)) {
				2211	*bad_wr = wr;
				2212	goto bail;
				2213	}
				2214	nreq++;
				2215	}
				2216	bail:
				2217	spin_unlock_irqrestore(&qp->s_hlock, flags);
				2218	if (nreq) {
				2219	/*
				2220	* Only call do_send if there is exactly one packet, and the
				2221	* driver said it was ok.
				2222	*/
				2223	if (nreq == 1 && call_send)
				2224	rdi->driver_f.do_send(qp);
				2225	else
				2226	rdi->driver_f.schedule_send_no_lock(qp);
				2227	}
				2228	return err;
				2229	}
				2230
				2231	/**
				2232	* rvt_post_srq_receive - post a receive on a shared receive queue
				2233	* @ibsrq: the SRQ to post the receive on
				2234	* @wr: the list of work requests to post
				2235	* @bad_wr: A pointer to the first WR to cause a problem is put here
				2236	*
				2237	* This may be called from interrupt context.
				2238	*
				2239	* Return: 0 on success else errno
				2240	*/
				2241	int rvt_post_srq_recv(struct ib_srq ibsrq, const struct ib_recv_wr wr,
				2242	const struct ib_recv_wr **bad_wr)
				2243	{
				2244	struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
				2245	struct rvt_krwq *wq;
				2246	unsigned long flags;
				2247
				2248	for (; wr; wr = wr->next) {
				2249	struct rvt_rwqe *wqe;
				2250	u32 next;
				2251	int i;
				2252
				2253	if ((unsigned)wr->num_sge > srq->rq.max_sge) {
				2254	*bad_wr = wr;
				2255	return -EINVAL;
				2256	}
				2257
				2258	spin_lock_irqsave(&srq->rq.kwq->p_lock, flags);
				2259	wq = srq->rq.kwq;
				2260	next = wq->head + 1;
				2261	if (next >= srq->rq.size)
				2262	next = 0;
				2263	if (next == READ_ONCE(wq->tail)) {
				2264	spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
				2265	*bad_wr = wr;
				2266	return -ENOMEM;
				2267	}
				2268
				2269	wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
				2270	wqe->wr_id = wr->wr_id;
				2271	wqe->num_sge = wr->num_sge;
				2272	for (i = 0; i < wr->num_sge; i++) {
				2273	wqe->sg_list[i].addr = wr->sg_list[i].addr;
				2274	wqe->sg_list[i].length = wr->sg_list[i].length;
				2275	wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
				2276	}
				2277	/* Make sure queue entry is written before the head index. */
				2278	smp_store_release(&wq->head, next);
				2279	spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
				2280	}
				2281	return 0;
				2282	}
				2283
				2284	/*
				2285	* rvt used the internal kernel struct as part of its ABI, for now make sure
				2286	* the kernel struct does not change layout. FIXME: rvt should never cast the
				2287	* user struct to a kernel struct.
				2288	*/
				2289	static struct ib_sge rvt_cast_sge(struct rvt_wqe_sge sge)
				2290	{
				2291	BUILD_BUG_ON(offsetof(struct ib_sge, addr) !=
				2292	offsetof(struct rvt_wqe_sge, addr));
				2293	BUILD_BUG_ON(offsetof(struct ib_sge, length) !=
				2294	offsetof(struct rvt_wqe_sge, length));
				2295	BUILD_BUG_ON(offsetof(struct ib_sge, lkey) !=
				2296	offsetof(struct rvt_wqe_sge, lkey));
				2297	return (struct ib_sge *)sge;
				2298	}
				2299
				2300	/*
				2301	* Validate a RWQE and fill in the SGE state.
				2302	* Return 1 if OK.
				2303	*/
				2304	static int init_sge(struct rvt_qp qp, struct rvt_rwqe wqe)
				2305	{
				2306	int i, j, ret;
				2307	struct ib_wc wc;
				2308	struct rvt_lkey_table *rkt;
				2309	struct rvt_pd *pd;
				2310	struct rvt_sge_state *ss;
				2311	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
				2312
				2313	rkt = &rdi->lkey_table;
				2314	pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
				2315	ss = &qp->r_sge;
				2316	ss->sg_list = qp->r_sg_list;
				2317	qp->r_len = 0;
				2318	for (i = j = 0; i < wqe->num_sge; i++) {
				2319	if (wqe->sg_list[i].length == 0)
				2320	continue;
				2321	/* Check LKEY */
				2322	ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
				2323	NULL, rvt_cast_sge(&wqe->sg_list[i]),
				2324	IB_ACCESS_LOCAL_WRITE);
				2325	if (unlikely(ret <= 0))
				2326	goto bad_lkey;
				2327	qp->r_len += wqe->sg_list[i].length;
				2328	j++;
				2329	}
				2330	ss->num_sge = j;
				2331	ss->total_len = qp->r_len;
				2332	return 1;
				2333
				2334	bad_lkey:
				2335	while (j) {
				2336	struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
				2337
				2338	rvt_put_mr(sge->mr);
				2339	}
				2340	ss->num_sge = 0;
				2341	memset(&wc, 0, sizeof(wc));
				2342	wc.wr_id = wqe->wr_id;
				2343	wc.status = IB_WC_LOC_PROT_ERR;
				2344	wc.opcode = IB_WC_RECV;
				2345	wc.qp = &qp->ibqp;
				2346	/* Signal solicited completion event. */
				2347	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
				2348	return 0;
				2349	}
				2350
				2351	/**
				2352	* get_rvt_head - get head indices of the circular buffer
				2353	* @rq: data structure for request queue entry
				2354	* @ip: the QP
				2355	*
				2356	* Return - head index value
				2357	*/
				2358	static inline u32 get_rvt_head(struct rvt_rq rq, void ip)
				2359	{
				2360	u32 head;
				2361
				2362	if (ip)
				2363	head = RDMA_READ_UAPI_ATOMIC(rq->wq->head);
				2364	else
				2365	head = rq->kwq->head;
				2366
				2367	return head;
				2368	}
				2369
				2370	/**
				2371	* rvt_get_rwqe - copy the next RWQE into the QP's RWQE
				2372	* @qp: the QP
				2373	* @wr_id_only: update qp->r_wr_id only, not qp->r_sge
				2374	*
				2375	* Return -1 if there is a local error, 0 if no RWQE is available,
				2376	* otherwise return 1.
				2377	*
				2378	* Can be called from interrupt level.
				2379	*/
				2380	int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
				2381	{
				2382	unsigned long flags;
				2383	struct rvt_rq *rq;
				2384	struct rvt_krwq *kwq = NULL;
				2385	struct rvt_rwq *wq;
				2386	struct rvt_srq *srq;
				2387	struct rvt_rwqe *wqe;
				2388	void (handler)(struct ib_event , void *);
				2389	u32 tail;
				2390	u32 head;
				2391	int ret;
				2392	void *ip = NULL;
				2393
				2394	if (qp->ibqp.srq) {
				2395	srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
				2396	handler = srq->ibsrq.event_handler;
				2397	rq = &srq->rq;
				2398	ip = srq->ip;
				2399	} else {
				2400	srq = NULL;
				2401	handler = NULL;
				2402	rq = &qp->r_rq;
				2403	ip = qp->ip;
				2404	}
				2405
				2406	spin_lock_irqsave(&rq->kwq->c_lock, flags);
				2407	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
				2408	ret = 0;
				2409	goto unlock;
				2410	}
				2411	kwq = rq->kwq;
				2412	if (ip) {
				2413	wq = rq->wq;
				2414	tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
				2415	} else {
				2416	tail = kwq->tail;
				2417	}
				2418
				2419	/* Validate tail before using it since it is user writable. */
				2420	if (tail >= rq->size)
				2421	tail = 0;
				2422
				2423	if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) {
				2424	head = get_rvt_head(rq, ip);
				2425	kwq->count = rvt_get_rq_count(rq, head, tail);
				2426	}
				2427	if (unlikely(kwq->count == 0)) {
				2428	ret = 0;
				2429	goto unlock;
				2430	}
				2431	/* Make sure entry is read after the count is read. */
				2432	smp_rmb();
				2433	wqe = rvt_get_rwqe_ptr(rq, tail);
				2434	/*
				2435	* Even though we update the tail index in memory, the verbs
				2436	* consumer is not supposed to post more entries until a
				2437	* completion is generated.
				2438	*/
				2439	if (++tail >= rq->size)
				2440	tail = 0;
				2441	if (ip)
				2442	RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
				2443	else
				2444	kwq->tail = tail;
				2445	if (!wr_id_only && !init_sge(qp, wqe)) {
				2446	ret = -1;
				2447	goto unlock;
				2448	}
				2449	qp->r_wr_id = wqe->wr_id;
				2450
				2451	kwq->count--;
				2452	ret = 1;
				2453	set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
				2454	if (handler) {
				2455	/*
				2456	* Validate head pointer value and compute
				2457	* the number of remaining WQEs.
				2458	*/
				2459	if (kwq->count < srq->limit) {
				2460	kwq->count =
				2461	rvt_get_rq_count(rq,
				2462	get_rvt_head(rq, ip), tail);
				2463	if (kwq->count < srq->limit) {
				2464	struct ib_event ev;
				2465
				2466	srq->limit = 0;
				2467	spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
				2468	ev.device = qp->ibqp.device;
				2469	ev.element.srq = qp->ibqp.srq;
				2470	ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
				2471	handler(&ev, srq->ibsrq.srq_context);
				2472	goto bail;
				2473	}
				2474	}
				2475	}
				2476	unlock:
				2477	spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
				2478	bail:
				2479	return ret;
				2480	}
				2481	EXPORT_SYMBOL(rvt_get_rwqe);
				2482
				2483	/**
				2484	* qp_comm_est - handle trap with QP established
				2485	* @qp: the QP
				2486	*/
				2487	void rvt_comm_est(struct rvt_qp *qp)
				2488	{
				2489	qp->r_flags \|= RVT_R_COMM_EST;
				2490	if (qp->ibqp.event_handler) {
				2491	struct ib_event ev;
				2492
				2493	ev.device = qp->ibqp.device;
				2494	ev.element.qp = &qp->ibqp;
				2495	ev.event = IB_EVENT_COMM_EST;
				2496	qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
				2497	}
				2498	}
				2499	EXPORT_SYMBOL(rvt_comm_est);
				2500
				2501	void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
				2502	{
				2503	unsigned long flags;
				2504	int lastwqe;
				2505
				2506	spin_lock_irqsave(&qp->s_lock, flags);
				2507	lastwqe = rvt_error_qp(qp, err);
				2508	spin_unlock_irqrestore(&qp->s_lock, flags);
				2509
				2510	if (lastwqe) {
				2511	struct ib_event ev;
				2512
				2513	ev.device = qp->ibqp.device;
				2514	ev.element.qp = &qp->ibqp;
				2515	ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
				2516	qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
				2517	}
				2518	}
				2519	EXPORT_SYMBOL(rvt_rc_error);
				2520
				2521	/*
				2522	* rvt_rnr_tbl_to_usec - return index into ib_rvt_rnr_table
				2523	* @index - the index
				2524	* return usec from an index into ib_rvt_rnr_table
				2525	*/
				2526	unsigned long rvt_rnr_tbl_to_usec(u32 index)
				2527	{
				2528	return ib_rvt_rnr_table[(index & IB_AETH_CREDIT_MASK)];
				2529	}
				2530	EXPORT_SYMBOL(rvt_rnr_tbl_to_usec);
				2531
				2532	static inline unsigned long rvt_aeth_to_usec(u32 aeth)
				2533	{
				2534	return ib_rvt_rnr_table[(aeth >> IB_AETH_CREDIT_SHIFT) &
				2535	IB_AETH_CREDIT_MASK];
				2536	}
				2537
				2538	/*
				2539	* rvt_add_retry_timer_ext - add/start a retry timer
				2540	* @qp - the QP
				2541	* @shift - timeout shift to wait for multiple packets
				2542	* add a retry timer on the QP
				2543	*/
				2544	void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
				2545	{
				2546	struct ib_qp *ibqp = &qp->ibqp;
				2547	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
				2548
				2549	lockdep_assert_held(&qp->s_lock);
				2550	qp->s_flags \|= RVT_S_TIMER;
				2551	/* 4.096 usec. * (1 << qp->timeout) */
				2552	qp->s_timer.expires = jiffies + rdi->busy_jiffies +
				2553	(qp->timeout_jiffies << shift);
				2554	add_timer(&qp->s_timer);
				2555	}
				2556	EXPORT_SYMBOL(rvt_add_retry_timer_ext);
				2557
				2558	/**
				2559	* rvt_add_rnr_timer - add/start an rnr timer
				2560	* @qp - the QP
				2561	* @aeth - aeth of RNR timeout, simulated aeth for loopback
				2562	* add an rnr timer on the QP
				2563	*/
				2564	void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth)
				2565	{
				2566	u32 to;
				2567
				2568	lockdep_assert_held(&qp->s_lock);
				2569	qp->s_flags \|= RVT_S_WAIT_RNR;
				2570	to = rvt_aeth_to_usec(aeth);
				2571	trace_rvt_rnrnak_add(qp, to);
				2572	hrtimer_start(&qp->s_rnr_timer,
				2573	ns_to_ktime(1000 * to), HRTIMER_MODE_REL_PINNED);
				2574	}
				2575	EXPORT_SYMBOL(rvt_add_rnr_timer);
				2576
				2577	/**
				2578	* rvt_stop_rc_timers - stop all timers
				2579	* @qp - the QP
				2580	* stop any pending timers
				2581	*/
				2582	void rvt_stop_rc_timers(struct rvt_qp *qp)
				2583	{
				2584	lockdep_assert_held(&qp->s_lock);
				2585	/* Remove QP from all timers */
				2586	if (qp->s_flags & (RVT_S_TIMER \| RVT_S_WAIT_RNR)) {
				2587	qp->s_flags &= ~(RVT_S_TIMER \| RVT_S_WAIT_RNR);
				2588	del_timer(&qp->s_timer);
				2589	hrtimer_try_to_cancel(&qp->s_rnr_timer);
				2590	}
				2591	}
				2592	EXPORT_SYMBOL(rvt_stop_rc_timers);
				2593
				2594	/**
				2595	* rvt_stop_rnr_timer - stop an rnr timer
				2596	* @qp - the QP
				2597	*
				2598	* stop an rnr timer and return if the timer
				2599	* had been pending.
				2600	*/
				2601	static void rvt_stop_rnr_timer(struct rvt_qp *qp)
				2602	{
				2603	lockdep_assert_held(&qp->s_lock);
				2604	/* Remove QP from rnr timer */
				2605	if (qp->s_flags & RVT_S_WAIT_RNR) {
				2606	qp->s_flags &= ~RVT_S_WAIT_RNR;
				2607	trace_rvt_rnrnak_stop(qp, 0);
				2608	}
				2609	}
				2610
				2611	/**
				2612	* rvt_del_timers_sync - wait for any timeout routines to exit
				2613	* @qp - the QP
				2614	*/
				2615	void rvt_del_timers_sync(struct rvt_qp *qp)
				2616	{
				2617	del_timer_sync(&qp->s_timer);
				2618	hrtimer_cancel(&qp->s_rnr_timer);
				2619	}
				2620	EXPORT_SYMBOL(rvt_del_timers_sync);
				2621
				2622	/**
				2623	* This is called from s_timer for missing responses.
				2624	*/
				2625	static void rvt_rc_timeout(struct timer_list *t)
				2626	{
				2627	struct rvt_qp *qp = from_timer(qp, t, s_timer);
				2628	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
				2629	unsigned long flags;
				2630
				2631	spin_lock_irqsave(&qp->r_lock, flags);
				2632	spin_lock(&qp->s_lock);
				2633	if (qp->s_flags & RVT_S_TIMER) {
				2634	struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
				2635
				2636	qp->s_flags &= ~RVT_S_TIMER;
				2637	rvp->n_rc_timeouts++;
				2638	del_timer(&qp->s_timer);
				2639	trace_rvt_rc_timeout(qp, qp->s_last_psn + 1);
				2640	if (rdi->driver_f.notify_restart_rc)
				2641	rdi->driver_f.notify_restart_rc(qp,
				2642	qp->s_last_psn + 1,
				2643	1);
				2644	rdi->driver_f.schedule_send(qp);
				2645	}
				2646	spin_unlock(&qp->s_lock);
				2647	spin_unlock_irqrestore(&qp->r_lock, flags);
				2648	}
				2649
				2650	/*
				2651	* This is called from s_timer for RNR timeouts.
				2652	*/
				2653	enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t)
				2654	{
				2655	struct rvt_qp *qp = container_of(t, struct rvt_qp, s_rnr_timer);
				2656	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
				2657	unsigned long flags;
				2658
				2659	spin_lock_irqsave(&qp->s_lock, flags);
				2660	rvt_stop_rnr_timer(qp);
				2661	trace_rvt_rnrnak_timeout(qp, 0);
				2662	rdi->driver_f.schedule_send(qp);
				2663	spin_unlock_irqrestore(&qp->s_lock, flags);
				2664	return HRTIMER_NORESTART;
				2665	}
				2666	EXPORT_SYMBOL(rvt_rc_rnr_retry);
				2667
				2668	/**
				2669	* rvt_qp_iter_init - initial for QP iteration
				2670	* @rdi: rvt devinfo
				2671	* @v: u64 value
				2672	*
				2673	* This returns an iterator suitable for iterating QPs
				2674	* in the system.
				2675	*
				2676	* The @cb is a user defined callback and @v is a 64
				2677	* bit value passed to and relevant for processing in the
				2678	* @cb. An example use case would be to alter QP processing
				2679	* based on criteria not part of the rvt_qp.
				2680	*
				2681	* Use cases that require memory allocation to succeed
				2682	* must preallocate appropriately.
				2683	*
				2684	* Return: a pointer to an rvt_qp_iter or NULL
				2685	*/
				2686	struct rvt_qp_iter rvt_qp_iter_init(struct rvt_dev_info rdi,
				2687	u64 v,
				2688	void (cb)(struct rvt_qp qp, u64 v))
				2689	{
				2690	struct rvt_qp_iter *i;
				2691
				2692	i = kzalloc(sizeof(*i), GFP_KERNEL);
				2693	if (!i)
				2694	return NULL;
				2695
				2696	i->rdi = rdi;
				2697	/* number of special QPs (SMI/GSI) for device */
				2698	i->specials = rdi->ibdev.phys_port_cnt * 2;
				2699	i->v = v;
				2700	i->cb = cb;
				2701
				2702	return i;
				2703	}
				2704	EXPORT_SYMBOL(rvt_qp_iter_init);
				2705
				2706	/**
				2707	* rvt_qp_iter_next - return the next QP in iter
				2708	* @iter - the iterator
				2709	*
				2710	* Fine grained QP iterator suitable for use
				2711	* with debugfs seq_file mechanisms.
				2712	*
				2713	* Updates iter->qp with the current QP when the return
				2714	* value is 0.
				2715	*
				2716	* Return: 0 - iter->qp is valid 1 - no more QPs
				2717	*/
				2718	int rvt_qp_iter_next(struct rvt_qp_iter *iter)
				2719	__must_hold(RCU)
				2720	{
				2721	int n = iter->n;
				2722	int ret = 1;
				2723	struct rvt_qp *pqp = iter->qp;
				2724	struct rvt_qp *qp;
				2725	struct rvt_dev_info *rdi = iter->rdi;
				2726
				2727	/*
				2728	* The approach is to consider the special qps
				2729	* as additional table entries before the
				2730	* real hash table. Since the qp code sets
				2731	* the qp->next hash link to NULL, this works just fine.
				2732	*
				2733	* iter->specials is 2 * # ports
				2734	*
				2735	* n = 0..iter->specials is the special qp indices
				2736	*
				2737	* n = iter->specials..rdi->qp_dev->qp_table_size+iter->specials are
				2738	* the potential hash bucket entries
				2739	*
				2740	*/
				2741	for (; n < rdi->qp_dev->qp_table_size + iter->specials; n++) {
				2742	if (pqp) {
				2743	qp = rcu_dereference(pqp->next);
				2744	} else {
				2745	if (n < iter->specials) {
				2746	struct rvt_ibport *rvp;
				2747	int pidx;
				2748
				2749	pidx = n % rdi->ibdev.phys_port_cnt;
				2750	rvp = rdi->ports[pidx];
				2751	qp = rcu_dereference(rvp->qp[n & 1]);
				2752	} else {
				2753	qp = rcu_dereference(
				2754	rdi->qp_dev->qp_table[
				2755	(n - iter->specials)]);
				2756	}
				2757	}
				2758	pqp = qp;
				2759	if (qp) {
				2760	iter->qp = qp;
				2761	iter->n = n;
				2762	return 0;
				2763	}
				2764	}
				2765	return ret;
				2766	}
				2767	EXPORT_SYMBOL(rvt_qp_iter_next);
				2768
				2769	/**
				2770	* rvt_qp_iter - iterate all QPs
				2771	* @rdi - rvt devinfo
				2772	* @v - a 64 bit value
				2773	* @cb - a callback
				2774	*
				2775	* This provides a way for iterating all QPs.
				2776	*
				2777	* The @cb is a user defined callback and @v is a 64
				2778	* bit value passed to and relevant for processing in the
				2779	* cb. An example use case would be to alter QP processing
				2780	* based on criteria not part of the rvt_qp.
				2781	*
				2782	* The code has an internal iterator to simplify
				2783	* non seq_file use cases.
				2784	*/
				2785	void rvt_qp_iter(struct rvt_dev_info *rdi,
				2786	u64 v,
				2787	void (cb)(struct rvt_qp qp, u64 v))
				2788	{
				2789	int ret;
				2790	struct rvt_qp_iter i = {
				2791	.rdi = rdi,
				2792	.specials = rdi->ibdev.phys_port_cnt * 2,
				2793	.v = v,
				2794	.cb = cb
				2795	};
				2796
				2797	rcu_read_lock();
				2798	do {
				2799	ret = rvt_qp_iter_next(&i);
				2800	if (!ret) {
				2801	rvt_get_qp(i.qp);
				2802	rcu_read_unlock();
				2803	i.cb(i.qp, i.v);
				2804	rcu_read_lock();
				2805	rvt_put_qp(i.qp);
				2806	}
				2807	} while (!ret);
				2808	rcu_read_unlock();
				2809	}
				2810	EXPORT_SYMBOL(rvt_qp_iter);
				2811
				2812	/*
				2813	* This should be called with s_lock and r_lock held.
				2814	*/
				2815	void rvt_send_complete(struct rvt_qp qp, struct rvt_swqe wqe,
				2816	enum ib_wc_status status)
				2817	{
				2818	u32 old_last, last;
				2819	struct rvt_dev_info *rdi;
				2820
				2821	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
				2822	return;
				2823	rdi = ib_to_rvt(qp->ibqp.device);
				2824
				2825	old_last = qp->s_last;
				2826	trace_rvt_qp_send_completion(qp, wqe, old_last);
				2827	last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode],
				2828	status);
				2829	if (qp->s_acked == old_last)
				2830	qp->s_acked = last;
				2831	if (qp->s_cur == old_last)
				2832	qp->s_cur = last;
				2833	if (qp->s_tail == old_last)
				2834	qp->s_tail = last;
				2835	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
				2836	qp->s_draining = 0;
				2837	}
				2838	EXPORT_SYMBOL(rvt_send_complete);
				2839
				2840	/**
				2841	* rvt_copy_sge - copy data to SGE memory
				2842	* @qp: associated QP
				2843	* @ss: the SGE state
				2844	* @data: the data to copy
				2845	* @length: the length of the data
				2846	* @release: boolean to release MR
				2847	* @copy_last: do a separate copy of the last 8 bytes
				2848	*/
				2849	void rvt_copy_sge(struct rvt_qp qp, struct rvt_sge_state ss,
				2850	void *data, u32 length,
				2851	bool release, bool copy_last)
				2852	{
				2853	struct rvt_sge *sge = &ss->sge;
				2854	int i;
				2855	bool in_last = false;
				2856	bool cacheless_copy = false;
				2857	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
				2858	struct rvt_wss *wss = rdi->wss;
				2859	unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
				2860
				2861	if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) {
				2862	cacheless_copy = length >= PAGE_SIZE;
				2863	} else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) {
				2864	if (length >= PAGE_SIZE) {
				2865	/*
				2866	* NOTE: this assumes:
				2867	* o The first vaddr is the dest.
				2868	* o If multiple pages, then vaddr is sequential.
				2869	*/
				2870	wss_insert(wss, sge->vaddr);
				2871	if (length >= (2 * PAGE_SIZE))
				2872	wss_insert(wss, (sge->vaddr + PAGE_SIZE));
				2873
				2874	cacheless_copy = wss_exceeds_threshold(wss);
				2875	} else {
				2876	wss_advance_clean_counter(wss);
				2877	}
				2878	}
				2879
				2880	if (copy_last) {
				2881	if (length > 8) {
				2882	length -= 8;
				2883	} else {
				2884	copy_last = false;
				2885	in_last = true;
				2886	}
				2887	}
				2888
				2889	again:
				2890	while (length) {
				2891	u32 len = rvt_get_sge_length(sge, length);
				2892
				2893	WARN_ON_ONCE(len == 0);
				2894	if (unlikely(in_last)) {
				2895	/* enforce byte transfer ordering */
				2896	for (i = 0; i < len; i++)
				2897	((u8 )sge->vaddr)[i] = ((u8 )data)[i];
				2898	} else if (cacheless_copy) {
				2899	cacheless_memcpy(sge->vaddr, data, len);
				2900	} else {
				2901	memcpy(sge->vaddr, data, len);
				2902	}
				2903	rvt_update_sge(ss, len, release);
				2904	data += len;
				2905	length -= len;
				2906	}
				2907
				2908	if (copy_last) {
				2909	copy_last = false;
				2910	in_last = true;
				2911	length = 8;
				2912	goto again;
				2913	}
				2914	}
				2915	EXPORT_SYMBOL(rvt_copy_sge);
				2916
				2917	static enum ib_wc_status loopback_qp_drop(struct rvt_ibport *rvp,
				2918	struct rvt_qp *sqp)
				2919	{
				2920	rvp->n_pkt_drops++;
				2921	/*
				2922	* For RC, the requester would timeout and retry so
				2923	* shortcut the timeouts and just signal too many retries.
				2924	*/
				2925	return sqp->ibqp.qp_type == IB_QPT_RC ?
				2926	IB_WC_RETRY_EXC_ERR : IB_WC_SUCCESS;
				2927	}
				2928
				2929	/**
				2930	* ruc_loopback - handle UC and RC loopback requests
				2931	* @sqp: the sending QP
				2932	*
				2933	* This is called from rvt_do_send() to forward a WQE addressed to the same HFI
				2934	* Note that although we are single threaded due to the send engine, we still
				2935	* have to protect against post_send(). We don't have to worry about
				2936	* receive interrupts since this is a connected protocol and all packets
				2937	* will pass through here.
				2938	*/
				2939	void rvt_ruc_loopback(struct rvt_qp *sqp)
				2940	{
				2941	struct rvt_ibport *rvp = NULL;
				2942	struct rvt_dev_info *rdi = ib_to_rvt(sqp->ibqp.device);
				2943	struct rvt_qp *qp;
				2944	struct rvt_swqe *wqe;
				2945	struct rvt_sge *sge;
				2946	unsigned long flags;
				2947	struct ib_wc wc;
				2948	u64 sdata;
				2949	atomic64_t *maddr;
				2950	enum ib_wc_status send_status;
				2951	bool release;
				2952	int ret;
				2953	bool copy_last = false;
				2954	int local_ops = 0;
				2955
				2956	rcu_read_lock();
				2957	rvp = rdi->ports[sqp->port_num - 1];
				2958
				2959	/*
				2960	* Note that we check the responder QP state after
				2961	* checking the requester's state.
				2962	*/
				2963
				2964	qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), rvp,
				2965	sqp->remote_qpn);
				2966
				2967	spin_lock_irqsave(&sqp->s_lock, flags);
				2968
				2969	/* Return if we are already busy processing a work request. */
				2970	if ((sqp->s_flags & (RVT_S_BUSY \| RVT_S_ANY_WAIT)) \|\|
				2971	!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
				2972	goto unlock;
				2973
				2974	sqp->s_flags \|= RVT_S_BUSY;
				2975
				2976	again:
				2977	if (sqp->s_last == READ_ONCE(sqp->s_head))
				2978	goto clr_busy;
				2979	wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
				2980
				2981	/* Return if it is not OK to start a new work request. */
				2982	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
				2983	if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
				2984	goto clr_busy;
				2985	/* We are in the error state, flush the work request. */
				2986	send_status = IB_WC_WR_FLUSH_ERR;
				2987	goto flush_send;
				2988	}
				2989
				2990	/*
				2991	* We can rely on the entry not changing without the s_lock
				2992	* being held until we update s_last.
				2993	* We increment s_cur to indicate s_last is in progress.
				2994	*/
				2995	if (sqp->s_last == sqp->s_cur) {
				2996	if (++sqp->s_cur >= sqp->s_size)
				2997	sqp->s_cur = 0;
				2998	}
				2999	spin_unlock_irqrestore(&sqp->s_lock, flags);
				3000
				3001	if (!qp) {
				3002	send_status = loopback_qp_drop(rvp, sqp);
				3003	goto serr_no_r_lock;
				3004	}
				3005	spin_lock_irqsave(&qp->r_lock, flags);
				3006	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) \|\|
				3007	qp->ibqp.qp_type != sqp->ibqp.qp_type) {
				3008	send_status = loopback_qp_drop(rvp, sqp);
				3009	goto serr;
				3010	}
				3011
				3012	memset(&wc, 0, sizeof(wc));
				3013	send_status = IB_WC_SUCCESS;
				3014
				3015	release = true;
				3016	sqp->s_sge.sge = wqe->sg_list[0];
				3017	sqp->s_sge.sg_list = wqe->sg_list + 1;
				3018	sqp->s_sge.num_sge = wqe->wr.num_sge;
				3019	sqp->s_len = wqe->length;
				3020	switch (wqe->wr.opcode) {
				3021	case IB_WR_REG_MR:
				3022	goto send_comp;
				3023
				3024	case IB_WR_LOCAL_INV:
				3025	if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
				3026	if (rvt_invalidate_rkey(sqp,
				3027	wqe->wr.ex.invalidate_rkey))
				3028	send_status = IB_WC_LOC_PROT_ERR;
				3029	local_ops = 1;
				3030	}
				3031	goto send_comp;
				3032
				3033	case IB_WR_SEND_WITH_INV:
				3034	case IB_WR_SEND_WITH_IMM:
				3035	case IB_WR_SEND:
				3036	ret = rvt_get_rwqe(qp, false);
				3037	if (ret < 0)
				3038	goto op_err;
				3039	if (!ret)
				3040	goto rnr_nak;
				3041	if (wqe->length > qp->r_len)
				3042	goto inv_err;
				3043	switch (wqe->wr.opcode) {
				3044	case IB_WR_SEND_WITH_INV:
				3045	if (!rvt_invalidate_rkey(qp,
				3046	wqe->wr.ex.invalidate_rkey)) {
				3047	wc.wc_flags = IB_WC_WITH_INVALIDATE;
				3048	wc.ex.invalidate_rkey =
				3049	wqe->wr.ex.invalidate_rkey;
				3050	}
				3051	break;
				3052	case IB_WR_SEND_WITH_IMM:
				3053	wc.wc_flags = IB_WC_WITH_IMM;
				3054	wc.ex.imm_data = wqe->wr.ex.imm_data;
				3055	break;
				3056	default:
				3057	break;
				3058	}
				3059	break;
				3060
				3061	case IB_WR_RDMA_WRITE_WITH_IMM:
				3062	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
				3063	goto inv_err;
				3064	wc.wc_flags = IB_WC_WITH_IMM;
				3065	wc.ex.imm_data = wqe->wr.ex.imm_data;
				3066	ret = rvt_get_rwqe(qp, true);
				3067	if (ret < 0)
				3068	goto op_err;
				3069	if (!ret)
				3070	goto rnr_nak;
				3071	/* skip copy_last set and qp_access_flags recheck */
				3072	goto do_write;
				3073	case IB_WR_RDMA_WRITE:
				3074	copy_last = rvt_is_user_qp(qp);
				3075	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
				3076	goto inv_err;
				3077	do_write:
				3078	if (wqe->length == 0)
				3079	break;
				3080	if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
				3081	wqe->rdma_wr.remote_addr,
				3082	wqe->rdma_wr.rkey,
				3083	IB_ACCESS_REMOTE_WRITE)))
				3084	goto acc_err;
				3085	qp->r_sge.sg_list = NULL;
				3086	qp->r_sge.num_sge = 1;
				3087	qp->r_sge.total_len = wqe->length;
				3088	break;
				3089
				3090	case IB_WR_RDMA_READ:
				3091	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
				3092	goto inv_err;
				3093	if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
				3094	wqe->rdma_wr.remote_addr,
				3095	wqe->rdma_wr.rkey,
				3096	IB_ACCESS_REMOTE_READ)))
				3097	goto acc_err;
				3098	release = false;
				3099	sqp->s_sge.sg_list = NULL;
				3100	sqp->s_sge.num_sge = 1;
				3101	qp->r_sge.sge = wqe->sg_list[0];
				3102	qp->r_sge.sg_list = wqe->sg_list + 1;
				3103	qp->r_sge.num_sge = wqe->wr.num_sge;
				3104	qp->r_sge.total_len = wqe->length;
				3105	break;
				3106
				3107	case IB_WR_ATOMIC_CMP_AND_SWP:
				3108	case IB_WR_ATOMIC_FETCH_AND_ADD:
				3109	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
				3110	goto inv_err;
				3111	if (unlikely(wqe->atomic_wr.remote_addr & (sizeof(u64) - 1)))
				3112	goto inv_err;
				3113	if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
				3114	wqe->atomic_wr.remote_addr,
				3115	wqe->atomic_wr.rkey,
				3116	IB_ACCESS_REMOTE_ATOMIC)))
				3117	goto acc_err;
				3118	/* Perform atomic OP and save result. */
				3119	maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
				3120	sdata = wqe->atomic_wr.compare_add;
				3121	(u64 )sqp->s_sge.sge.vaddr =
				3122	(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
				3123	(u64)atomic64_add_return(sdata, maddr) - sdata :
				3124	(u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
				3125	sdata, wqe->atomic_wr.swap);
				3126	rvt_put_mr(qp->r_sge.sge.mr);
				3127	qp->r_sge.num_sge = 0;
				3128	goto send_comp;
				3129
				3130	default:
				3131	send_status = IB_WC_LOC_QP_OP_ERR;
				3132	goto serr;
				3133	}
				3134
				3135	sge = &sqp->s_sge.sge;
				3136	while (sqp->s_len) {
				3137	u32 len = rvt_get_sge_length(sge, sqp->s_len);
				3138
				3139	WARN_ON_ONCE(len == 0);
				3140	rvt_copy_sge(qp, &qp->r_sge, sge->vaddr,
				3141	len, release, copy_last);
				3142	rvt_update_sge(&sqp->s_sge, len, !release);
				3143	sqp->s_len -= len;
				3144	}
				3145	if (release)
				3146	rvt_put_ss(&qp->r_sge);
				3147
				3148	if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
				3149	goto send_comp;
				3150
				3151	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
				3152	wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
				3153	else
				3154	wc.opcode = IB_WC_RECV;
				3155	wc.wr_id = qp->r_wr_id;
				3156	wc.status = IB_WC_SUCCESS;
				3157	wc.byte_len = wqe->length;
				3158	wc.qp = &qp->ibqp;
				3159	wc.src_qp = qp->remote_qpn;
				3160	wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
				3161	wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
				3162	wc.port_num = 1;
				3163	/* Signal completion event if the solicited bit is set. */
				3164	rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED);
				3165
				3166	send_comp:
				3167	spin_unlock_irqrestore(&qp->r_lock, flags);
				3168	spin_lock_irqsave(&sqp->s_lock, flags);
				3169	rvp->n_loop_pkts++;
				3170	flush_send:
				3171	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
				3172	spin_lock(&sqp->r_lock);
				3173	rvt_send_complete(sqp, wqe, send_status);
				3174	spin_unlock(&sqp->r_lock);
				3175	if (local_ops) {
				3176	atomic_dec(&sqp->local_ops_pending);
				3177	local_ops = 0;
				3178	}
				3179	goto again;
				3180
				3181	rnr_nak:
				3182	/* Handle RNR NAK */
				3183	if (qp->ibqp.qp_type == IB_QPT_UC)
				3184	goto send_comp;
				3185	rvp->n_rnr_naks++;
				3186	/*
				3187	* Note: we don't need the s_lock held since the BUSY flag
				3188	* makes this single threaded.
				3189	*/
				3190	if (sqp->s_rnr_retry == 0) {
				3191	send_status = IB_WC_RNR_RETRY_EXC_ERR;
				3192	goto serr;
				3193	}
				3194	if (sqp->s_rnr_retry_cnt < 7)
				3195	sqp->s_rnr_retry--;
				3196	spin_unlock_irqrestore(&qp->r_lock, flags);
				3197	spin_lock_irqsave(&sqp->s_lock, flags);
				3198	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
				3199	goto clr_busy;
				3200	rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer <<
				3201	IB_AETH_CREDIT_SHIFT);
				3202	goto clr_busy;
				3203
				3204	op_err:
				3205	send_status = IB_WC_REM_OP_ERR;
				3206	wc.status = IB_WC_LOC_QP_OP_ERR;
				3207	goto err;
				3208
				3209	inv_err:
				3210	send_status =
				3211	sqp->ibqp.qp_type == IB_QPT_RC ?
				3212	IB_WC_REM_INV_REQ_ERR :
				3213	IB_WC_SUCCESS;
				3214	wc.status = IB_WC_LOC_QP_OP_ERR;
				3215	goto err;
				3216
				3217	acc_err:
				3218	send_status = IB_WC_REM_ACCESS_ERR;
				3219	wc.status = IB_WC_LOC_PROT_ERR;
				3220	err:
				3221	/* responder goes to error state */
				3222	rvt_rc_error(qp, wc.status);
				3223
				3224	serr:
				3225	spin_unlock_irqrestore(&qp->r_lock, flags);
				3226	serr_no_r_lock:
				3227	spin_lock_irqsave(&sqp->s_lock, flags);
				3228	spin_lock(&sqp->r_lock);
				3229	rvt_send_complete(sqp, wqe, send_status);
				3230	spin_unlock(&sqp->r_lock);
				3231	if (sqp->ibqp.qp_type == IB_QPT_RC) {
				3232	int lastwqe;
				3233
				3234	spin_lock(&sqp->r_lock);
				3235	lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
				3236	spin_unlock(&sqp->r_lock);
				3237
				3238	sqp->s_flags &= ~RVT_S_BUSY;
				3239	spin_unlock_irqrestore(&sqp->s_lock, flags);
				3240	if (lastwqe) {
				3241	struct ib_event ev;
				3242
				3243	ev.device = sqp->ibqp.device;
				3244	ev.element.qp = &sqp->ibqp;
				3245	ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
				3246	sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
				3247	}
				3248	goto done;
				3249	}
				3250	clr_busy:
				3251	sqp->s_flags &= ~RVT_S_BUSY;
				3252	unlock:
				3253	spin_unlock_irqrestore(&sqp->s_lock, flags);
				3254	done:
				3255	rcu_read_unlock();
				3256	}
				3257	EXPORT_SYMBOL(rvt_ruc_loopback);