Blame - marvell/linux/drivers/infiniband/hw/hfi1/user_sdma.c - T108

blob: a92346e88628bd0a9d214df2b58e0aaf4e891ec9 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	/*
				2	* Copyright(c) 2015 - 2018 Intel Corporation.
				3	*
				4	* This file is provided under a dual BSD/GPLv2 license. When using or
				5	* redistributing this file, you may do so under either license.
				6	*
				7	* GPL LICENSE SUMMARY
				8	*
				9	* This program is free software; you can redistribute it and/or modify
				10	* it under the terms of version 2 of the GNU General Public License as
				11	* published by the Free Software Foundation.
				12	*
				13	* This program is distributed in the hope that it will be useful, but
				14	* WITHOUT ANY WARRANTY; without even the implied warranty of
				15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				16	* General Public License for more details.
				17	*
				18	* BSD LICENSE
				19	*
				20	* Redistribution and use in source and binary forms, with or without
				21	* modification, are permitted provided that the following conditions
				22	* are met:
				23	*
				24	* - Redistributions of source code must retain the above copyright
				25	* notice, this list of conditions and the following disclaimer.
				26	* - Redistributions in binary form must reproduce the above copyright
				27	* notice, this list of conditions and the following disclaimer in
				28	* the documentation and/or other materials provided with the
				29	* distribution.
				30	* - Neither the name of Intel Corporation nor the names of its
				31	* contributors may be used to endorse or promote products derived
				32	* from this software without specific prior written permission.
				33	*
				34	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				35	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				36	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				37	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				38	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				39	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				40	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				41	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				42	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				43	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				44	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				45	*
				46	*/
				47	#include <linux/mm.h>
				48	#include <linux/types.h>
				49	#include <linux/device.h>
				50	#include <linux/dmapool.h>
				51	#include <linux/slab.h>
				52	#include <linux/list.h>
				53	#include <linux/highmem.h>
				54	#include <linux/io.h>
				55	#include <linux/uio.h>
				56	#include <linux/rbtree.h>
				57	#include <linux/spinlock.h>
				58	#include <linux/delay.h>
				59	#include <linux/kthread.h>
				60	#include <linux/mmu_context.h>
				61	#include <linux/module.h>
				62	#include <linux/vmalloc.h>
				63	#include <linux/string.h>
				64
				65	#include "hfi.h"
				66	#include "sdma.h"
				67	#include "mmu_rb.h"
				68	#include "user_sdma.h"
				69	#include "verbs.h" /* for the headers */
				70	#include "common.h" /* for struct hfi1_tid_info */
				71	#include "trace.h"
				72
				73	static uint hfi1_sdma_comp_ring_size = 128;
				74	module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
				75	MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
				76
				77	static unsigned initial_pkt_count = 8;
				78
				79	static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
				80	static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
				81	static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
				82	static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
				83	static int pin_vector_pages(struct user_sdma_request *req,
				84	struct user_sdma_iovec *iovec);
				85	static void unpin_vector_pages(struct mm_struct mm, struct page *pages,
				86	unsigned start, unsigned npages);
				87	static int check_header_template(struct user_sdma_request *req,
				88	struct hfi1_pkt_header *hdr, u32 lrhlen,
				89	u32 datalen);
				90	static int set_txreq_header(struct user_sdma_request *req,
				91	struct user_sdma_txreq *tx, u32 datalen);
				92	static int set_txreq_header_ahg(struct user_sdma_request *req,
				93	struct user_sdma_txreq *tx, u32 len);
				94	static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				95	struct hfi1_user_sdma_comp_q *cq,
				96	u16 idx, enum hfi1_sdma_comp_state state,
				97	int ret);
				98	static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
				99	static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
				100
				101	static int defer_packet_queue(
				102	struct sdma_engine *sde,
				103	struct iowait_work *wait,
				104	struct sdma_txreq *txreq,
				105	uint seq,
				106	bool pkts_sent);
				107	static void activate_packet_queue(struct iowait *wait, int reason);
				108	static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
				109	unsigned long len);
				110	static int sdma_rb_insert(void arg, struct mmu_rb_node mnode);
				111	static int sdma_rb_evict(void arg, struct mmu_rb_node mnode,
				112	void arg2, bool stop);
				113	static void sdma_rb_remove(void arg, struct mmu_rb_node mnode);
				114	static int sdma_rb_invalidate(void arg, struct mmu_rb_node mnode);
				115
				116	static struct mmu_rb_ops sdma_rb_ops = {
				117	.filter = sdma_rb_filter,
				118	.insert = sdma_rb_insert,
				119	.evict = sdma_rb_evict,
				120	.remove = sdma_rb_remove,
				121	.invalidate = sdma_rb_invalidate
				122	};
				123
				124	static int defer_packet_queue(
				125	struct sdma_engine *sde,
				126	struct iowait_work *wait,
				127	struct sdma_txreq *txreq,
				128	uint seq,
				129	bool pkts_sent)
				130	{
				131	struct hfi1_user_sdma_pkt_q *pq =
				132	container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
				133
				134	write_seqlock(&sde->waitlock);
				135	if (sdma_progress(sde, seq, txreq))
				136	goto eagain;
				137	/*
				138	* We are assuming that if the list is enqueued somewhere, it
				139	* is to the dmawait list since that is the only place where
				140	* it is supposed to be enqueued.
				141	*/
				142	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
				143	if (list_empty(&pq->busy.list)) {
				144	pq->busy.lock = &sde->waitlock;
				145	iowait_get_priority(&pq->busy);
				146	iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
				147	}
				148	write_sequnlock(&sde->waitlock);
				149	return -EBUSY;
				150	eagain:
				151	write_sequnlock(&sde->waitlock);
				152	return -EAGAIN;
				153	}
				154
				155	static void activate_packet_queue(struct iowait *wait, int reason)
				156	{
				157	struct hfi1_user_sdma_pkt_q *pq =
				158	container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
				159	pq->busy.lock = NULL;
				160	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
				161	wake_up(&wait->wait_dma);
				162	};
				163
				164	int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				165	struct hfi1_filedata *fd)
				166	{
				167	int ret = -ENOMEM;
				168	char buf[64];
				169	struct hfi1_devdata *dd;
				170	struct hfi1_user_sdma_comp_q *cq;
				171	struct hfi1_user_sdma_pkt_q *pq;
				172
				173	if (!uctxt \|\| !fd)
				174	return -EBADF;
				175
				176	if (!hfi1_sdma_comp_ring_size)
				177	return -EINVAL;
				178
				179	dd = uctxt->dd;
				180
				181	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
				182	if (!pq)
				183	return -ENOMEM;
				184	pq->dd = dd;
				185	pq->ctxt = uctxt->ctxt;
				186	pq->subctxt = fd->subctxt;
				187	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
				188	atomic_set(&pq->n_reqs, 0);
				189	init_waitqueue_head(&pq->wait);
				190	atomic_set(&pq->n_locked, 0);
				191	pq->mm = fd->mm;
				192
				193	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
				194	activate_packet_queue, NULL, NULL);
				195	pq->reqidx = 0;
				196
				197	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
				198	sizeof(*pq->reqs),
				199	GFP_KERNEL);
				200	if (!pq->reqs)
				201	goto pq_reqs_nomem;
				202
				203	pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
				204	sizeof(*pq->req_in_use),
				205	GFP_KERNEL);
				206	if (!pq->req_in_use)
				207	goto pq_reqs_no_in_use;
				208
				209	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
				210	fd->subctxt);
				211	pq->txreq_cache = kmem_cache_create(buf,
				212	sizeof(struct user_sdma_txreq),
				213	L1_CACHE_BYTES,
				214	SLAB_HWCACHE_ALIGN,
				215	NULL);
				216	if (!pq->txreq_cache) {
				217	dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
				218	uctxt->ctxt);
				219	goto pq_txreq_nomem;
				220	}
				221
				222	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
				223	if (!cq)
				224	goto cq_nomem;
				225
				226	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				227	* hfi1_sdma_comp_ring_size));
				228	if (!cq->comps)
				229	goto cq_comps_nomem;
				230
				231	cq->nentries = hfi1_sdma_comp_ring_size;
				232
				233	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
				234	&pq->handler);
				235	if (ret) {
				236	dd_dev_err(dd, "Failed to register with MMU %d", ret);
				237	goto pq_mmu_fail;
				238	}
				239
				240	rcu_assign_pointer(fd->pq, pq);
				241	fd->cq = cq;
				242
				243	return 0;
				244
				245	pq_mmu_fail:
				246	vfree(cq->comps);
				247	cq_comps_nomem:
				248	kfree(cq);
				249	cq_nomem:
				250	kmem_cache_destroy(pq->txreq_cache);
				251	pq_txreq_nomem:
				252	kfree(pq->req_in_use);
				253	pq_reqs_no_in_use:
				254	kfree(pq->reqs);
				255	pq_reqs_nomem:
				256	kfree(pq);
				257
				258	return ret;
				259	}
				260
				261	static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
				262	{
				263	unsigned long flags;
				264	seqlock_t *lock = pq->busy.lock;
				265
				266	if (!lock)
				267	return;
				268	write_seqlock_irqsave(lock, flags);
				269	if (!list_empty(&pq->busy.list)) {
				270	list_del_init(&pq->busy.list);
				271	pq->busy.lock = NULL;
				272	}
				273	write_sequnlock_irqrestore(lock, flags);
				274	}
				275
				276	int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
				277	struct hfi1_ctxtdata *uctxt)
				278	{
				279	struct hfi1_user_sdma_pkt_q *pq;
				280
				281	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
				282
				283	spin_lock(&fd->pq_rcu_lock);
				284	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
				285	lockdep_is_held(&fd->pq_rcu_lock));
				286	if (pq) {
				287	rcu_assign_pointer(fd->pq, NULL);
				288	spin_unlock(&fd->pq_rcu_lock);
				289	synchronize_srcu(&fd->pq_srcu);
				290	/* at this point there can be no more new requests */
				291	if (pq->handler)
				292	hfi1_mmu_rb_unregister(pq->handler);
				293	iowait_sdma_drain(&pq->busy);
				294	/* Wait until all requests have been freed. */
				295	wait_event_interruptible(
				296	pq->wait,
				297	!atomic_read(&pq->n_reqs));
				298	kfree(pq->reqs);
				299	kfree(pq->req_in_use);
				300	kmem_cache_destroy(pq->txreq_cache);
				301	flush_pq_iowait(pq);
				302	kfree(pq);
				303	} else {
				304	spin_unlock(&fd->pq_rcu_lock);
				305	}
				306	if (fd->cq) {
				307	vfree(fd->cq->comps);
				308	kfree(fd->cq);
				309	fd->cq = NULL;
				310	}
				311	return 0;
				312	}
				313
				314	static u8 dlid_to_selector(u16 dlid)
				315	{
				316	static u8 mapping[256];
				317	static int initialized;
				318	static u8 next;
				319	int hash;
				320
				321	if (!initialized) {
				322	memset(mapping, 0xFF, 256);
				323	initialized = 1;
				324	}
				325
				326	hash = ((dlid >> 8) ^ dlid) & 0xFF;
				327	if (mapping[hash] == 0xFF) {
				328	mapping[hash] = next;
				329	next = (next + 1) & 0x7F;
				330	}
				331
				332	return mapping[hash];
				333	}
				334
				335	/**
				336	* hfi1_user_sdma_process_request() - Process and start a user sdma request
				337	* @fd: valid file descriptor
				338	* @iovec: array of io vectors to process
				339	* @dim: overall iovec array size
				340	* @count: number of io vector array entries processed
				341	*/
				342	int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				343	struct iovec *iovec, unsigned long dim,
				344	unsigned long *count)
				345	{
				346	int ret = 0, i;
				347	struct hfi1_ctxtdata *uctxt = fd->uctxt;
				348	struct hfi1_user_sdma_pkt_q *pq =
				349	srcu_dereference(fd->pq, &fd->pq_srcu);
				350	struct hfi1_user_sdma_comp_q *cq = fd->cq;
				351	struct hfi1_devdata *dd = pq->dd;
				352	unsigned long idx = 0;
				353	u8 pcount = initial_pkt_count;
				354	struct sdma_req_info info;
				355	struct user_sdma_request *req;
				356	u8 opcode, sc, vl;
				357	u16 pkey;
				358	u32 slid;
				359	u16 dlid;
				360	u32 selector;
				361
				362	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
				363	hfi1_cdbg(
				364	SDMA,
				365	"[%u:%u:%u] First vector not big enough for header %lu/%lu",
				366	dd->unit, uctxt->ctxt, fd->subctxt,
				367	iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
				368	return -EINVAL;
				369	}
				370	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
				371	if (ret) {
				372	hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
				373	dd->unit, uctxt->ctxt, fd->subctxt, ret);
				374	return -EFAULT;
				375	}
				376
				377	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				378	(u16 *)&info);
				379	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
				380	hfi1_cdbg(SDMA,
				381	"[%u:%u:%u:%u] Invalid comp index",
				382	dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
				383	return -EINVAL;
				384	}
				385
				386	/*
				387	* Sanity check the header io vector count. Need at least 1 vector
				388	* (header) and cannot be larger than the actual io vector count.
				389	*/
				390	if (req_iovcnt(info.ctrl) < 1 \|\| req_iovcnt(info.ctrl) > dim) {
				391	hfi1_cdbg(SDMA,
				392	"[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
				393	dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
				394	req_iovcnt(info.ctrl), dim);
				395	return -EINVAL;
				396	}
				397
				398	if (!info.fragsize) {
				399	hfi1_cdbg(SDMA,
				400	"[%u:%u:%u:%u] Request does not specify fragsize",
				401	dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
				402	return -EINVAL;
				403	}
				404
				405	/* Try to claim the request. */
				406	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
				407	hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
				408	dd->unit, uctxt->ctxt, fd->subctxt,
				409	info.comp_idx);
				410	return -EBADSLT;
				411	}
				412	/*
				413	* All safety checks have been done and this request has been claimed.
				414	*/
				415	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
				416	info.comp_idx);
				417	req = pq->reqs + info.comp_idx;
				418	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
				419	req->data_len = 0;
				420	req->pq = pq;
				421	req->cq = cq;
				422	req->ahg_idx = -1;
				423	req->iov_idx = 0;
				424	req->sent = 0;
				425	req->seqnum = 0;
				426	req->seqcomp = 0;
				427	req->seqsubmitted = 0;
				428	req->tids = NULL;
				429	req->has_error = 0;
				430	INIT_LIST_HEAD(&req->txps);
				431
				432	memcpy(&req->info, &info, sizeof(info));
				433
				434	/* The request is initialized, count it */
				435	atomic_inc(&pq->n_reqs);
				436
				437	if (req_opcode(info.ctrl) == EXPECTED) {
				438	/* expected must have a TID info and at least one data vector */
				439	if (req->data_iovs < 2) {
				440	SDMA_DBG(req,
				441	"Not enough vectors for expected request");
				442	ret = -EINVAL;
				443	goto free_req;
				444	}
				445	req->data_iovs--;
				446	}
				447
				448	if (!info.npkts \|\| req->data_iovs > MAX_VECTORS_PER_REQ) {
				449	SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
				450	MAX_VECTORS_PER_REQ);
				451	ret = -EINVAL;
				452	goto free_req;
				453	}
				454	/* Copy the header from the user buffer */
				455	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
				456	sizeof(req->hdr));
				457	if (ret) {
				458	SDMA_DBG(req, "Failed to copy header template (%d)", ret);
				459	ret = -EFAULT;
				460	goto free_req;
				461	}
				462
				463	/* If Static rate control is not enabled, sanitize the header. */
				464	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
				465	req->hdr.pbc[2] = 0;
				466
				467	/* Validate the opcode. Do not trust packets from user space blindly. */
				468	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
				469	if ((opcode & USER_OPCODE_CHECK_MASK) !=
				470	USER_OPCODE_CHECK_VAL) {
				471	SDMA_DBG(req, "Invalid opcode (%d)", opcode);
				472	ret = -EINVAL;
				473	goto free_req;
				474	}
				475	/*
				476	* Validate the vl. Do not trust packets from user space blindly.
				477	* VL comes from PBC, SC comes from LRH, and the VL needs to
				478	* match the SC look up.
				479	*/
				480	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
				481	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) \|
				482	(((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
				483	if (vl >= dd->pport->vls_operational \|\|
				484	vl != sc_to_vlt(dd, sc)) {
				485	SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
				486	ret = -EINVAL;
				487	goto free_req;
				488	}
				489
				490	/* Checking P_KEY for requests from user-space */
				491	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
				492	slid = be16_to_cpu(req->hdr.lrh[3]);
				493	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
				494	ret = -EINVAL;
				495	goto free_req;
				496	}
				497
				498	/*
				499	* Also should check the BTH.lnh. If it says the next header is GRH then
				500	* the RXE parsing will be off and will land in the middle of the KDETH
				501	* or miss it entirely.
				502	*/
				503	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
				504	SDMA_DBG(req, "User tried to pass in a GRH");
				505	ret = -EINVAL;
				506	goto free_req;
				507	}
				508
				509	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
				510	/*
				511	* Calculate the initial TID offset based on the values of
				512	* KDETH.OFFSET and KDETH.OM that are passed in.
				513	*/
				514	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
				515	(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
				516	KDETH_OM_LARGE : KDETH_OM_SMALL);
				517	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
				518	info.comp_idx, req->tidoffset);
				519	idx++;
				520
				521	/* Save all the IO vector structures */
				522	for (i = 0; i < req->data_iovs; i++) {
				523	req->iovs[i].offset = 0;
				524	INIT_LIST_HEAD(&req->iovs[i].list);
				525	memcpy(&req->iovs[i].iov,
				526	iovec + idx++,
				527	sizeof(req->iovs[i].iov));
				528	ret = pin_vector_pages(req, &req->iovs[i]);
				529	if (ret) {
				530	req->data_iovs = i;
				531	goto free_req;
				532	}
				533	req->data_len += req->iovs[i].iov.iov_len;
				534	}
				535	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
				536	info.comp_idx, req->data_len);
				537	if (pcount > req->info.npkts)
				538	pcount = req->info.npkts;
				539	/*
				540	* Copy any TID info
				541	* User space will provide the TID info only when the
				542	* request type is EXPECTED. This is true even if there is
				543	* only one packet in the request and the header is already
				544	* setup. The reason for the singular TID case is that the
				545	* driver needs to perform safety checks.
				546	*/
				547	if (req_opcode(req->info.ctrl) == EXPECTED) {
				548	u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
				549	u32 *tmp;
				550
				551	if (!ntids \|\| ntids > MAX_TID_PAIR_ENTRIES) {
				552	ret = -EINVAL;
				553	goto free_req;
				554	}
				555
				556	/*
				557	* We have to copy all of the tids because they may vary
				558	* in size and, therefore, the TID count might not be
				559	* equal to the pkt count. However, there is no way to
				560	* tell at this point.
				561	*/
				562	tmp = memdup_user(iovec[idx].iov_base,
				563	ntids * sizeof(*req->tids));
				564	if (IS_ERR(tmp)) {
				565	ret = PTR_ERR(tmp);
				566	SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				567	ntids, ret);
				568	goto free_req;
				569	}
				570	req->tids = tmp;
				571	req->n_tids = ntids;
				572	req->tididx = 0;
				573	idx++;
				574	}
				575
				576	dlid = be16_to_cpu(req->hdr.lrh[1]);
				577	selector = dlid_to_selector(dlid);
				578	selector += uctxt->ctxt + fd->subctxt;
				579	req->sde = sdma_select_user_engine(dd, selector, vl);
				580
				581	if (!req->sde \|\| !sdma_running(req->sde)) {
				582	ret = -ECOMM;
				583	goto free_req;
				584	}
				585
				586	/* We don't need an AHG entry if the request contains only one packet */
				587	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
				588	req->ahg_idx = sdma_ahg_alloc(req->sde);
				589
				590	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
				591	pq->state = SDMA_PKT_Q_ACTIVE;
				592
				593	/*
				594	* This is a somewhat blocking send implementation.
				595	* The driver will block the caller until all packets of the
				596	* request have been submitted to the SDMA engine. However, it
				597	* will not wait for send completions.
				598	*/
				599	while (req->seqsubmitted != req->info.npkts) {
				600	ret = user_sdma_send_pkts(req, pcount);
				601	if (ret < 0) {
				602	if (ret != -EBUSY)
				603	goto free_req;
				604	if (wait_event_interruptible_timeout(
				605	pq->busy.wait_dma,
				606	pq->state == SDMA_PKT_Q_ACTIVE,
				607	msecs_to_jiffies(
				608	SDMA_IOWAIT_TIMEOUT)) <= 0)
				609	flush_pq_iowait(pq);
				610	}
				611	}
				612	*count += idx;
				613	return 0;
				614	free_req:
				615	/*
				616	* If the submitted seqsubmitted == npkts, the completion routine
				617	* controls the final state. If sequbmitted < npkts, wait for any
				618	* outstanding packets to finish before cleaning up.
				619	*/
				620	if (req->seqsubmitted < req->info.npkts) {
				621	if (req->seqsubmitted)
				622	wait_event(pq->busy.wait_dma,
				623	(req->seqcomp == req->seqsubmitted - 1));
				624	user_sdma_free_request(req, true);
				625	pq_update(pq);
				626	set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
				627	}
				628	return ret;
				629	}
				630
				631	static inline u32 compute_data_length(struct user_sdma_request *req,
				632	struct user_sdma_txreq *tx)
				633	{
				634	/*
				635	* Determine the proper size of the packet data.
				636	* The size of the data of the first packet is in the header
				637	* template. However, it includes the header and ICRC, which need
				638	* to be subtracted.
				639	* The minimum representable packet data length in a header is 4 bytes,
				640	* therefore, when the data length request is less than 4 bytes, there's
				641	* only one packet, and the packet data length is equal to that of the
				642	* request data length.
				643	* The size of the remaining packets is the minimum of the frag
				644	* size (MTU) or remaining data in the request.
				645	*/
				646	u32 len;
				647
				648	if (!req->seqnum) {
				649	if (req->data_len < sizeof(u32))
				650	len = req->data_len;
				651	else
				652	len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
				653	(sizeof(tx->hdr) - 4));
				654	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
				655	u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
				656	PAGE_SIZE;
				657	/*
				658	* Get the data length based on the remaining space in the
				659	* TID pair.
				660	*/
				661	len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
				662	/* If we've filled up the TID pair, move to the next one. */
				663	if (unlikely(!len) && ++req->tididx < req->n_tids &&
				664	req->tids[req->tididx]) {
				665	tidlen = EXP_TID_GET(req->tids[req->tididx],
				666	LEN) * PAGE_SIZE;
				667	req->tidoffset = 0;
				668	len = min_t(u32, tidlen, req->info.fragsize);
				669	}
				670	/*
				671	* Since the TID pairs map entire pages, make sure that we
				672	* are not going to try to send more data that we have
				673	* remaining.
				674	*/
				675	len = min(len, req->data_len - req->sent);
				676	} else {
				677	len = min(req->data_len - req->sent, (u32)req->info.fragsize);
				678	}
				679	trace_hfi1_sdma_user_compute_length(req->pq->dd,
				680	req->pq->ctxt,
				681	req->pq->subctxt,
				682	req->info.comp_idx,
				683	len);
				684	return len;
				685	}
				686
				687	static inline u32 pad_len(u32 len)
				688	{
				689	if (len & (sizeof(u32) - 1))
				690	len += sizeof(u32) - (len & (sizeof(u32) - 1));
				691	return len;
				692	}
				693
				694	static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
				695	{
				696	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
				697	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
				698	}
				699
				700	static int user_sdma_txadd_ahg(struct user_sdma_request *req,
				701	struct user_sdma_txreq *tx,
				702	u32 datalen)
				703	{
				704	int ret;
				705	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
				706	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
				707	struct hfi1_user_sdma_pkt_q *pq = req->pq;
				708
				709	/*
				710	* Copy the request header into the tx header
				711	* because the HW needs a cacheline-aligned
				712	* address.
				713	* This copy can be optimized out if the hdr
				714	* member of user_sdma_request were also
				715	* cacheline aligned.
				716	*/
				717	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
				718	if (PBC2LRH(pbclen) != lrhlen) {
				719	pbclen = (pbclen & 0xf000) \| LRH2PBC(lrhlen);
				720	tx->hdr.pbc[0] = cpu_to_le16(pbclen);
				721	}
				722	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
				723	if (ret)
				724	return ret;
				725	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
				726	sizeof(tx->hdr) + datalen, req->ahg_idx,
				727	0, NULL, 0, user_sdma_txreq_cb);
				728	if (ret)
				729	return ret;
				730	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
				731	if (ret)
				732	sdma_txclean(pq->dd, &tx->txreq);
				733	return ret;
				734	}
				735
				736	static int user_sdma_txadd(struct user_sdma_request *req,
				737	struct user_sdma_txreq *tx,
				738	struct user_sdma_iovec *iovec, u32 datalen,
				739	u32 queued_ptr, u32 data_sent_ptr,
				740	u64 *iov_offset_ptr)
				741	{
				742	int ret;
				743	unsigned int pageidx, len;
				744	unsigned long base, offset;
				745	u64 iov_offset = *iov_offset_ptr;
				746	u32 queued = queued_ptr, data_sent = data_sent_ptr;
				747	struct hfi1_user_sdma_pkt_q *pq = req->pq;
				748
				749	base = (unsigned long)iovec->iov.iov_base;
				750	offset = offset_in_page(base + iovec->offset + iov_offset);
				751	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
				752	PAGE_SHIFT);
				753	len = offset + req->info.fragsize > PAGE_SIZE ?
				754	PAGE_SIZE - offset : req->info.fragsize;
				755	len = min((datalen - queued), len);
				756	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
				757	offset, len);
				758	if (ret) {
				759	SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
				760	return ret;
				761	}
				762	iov_offset += len;
				763	queued += len;
				764	data_sent += len;
				765	if (unlikely(queued < datalen && pageidx == iovec->npages &&
				766	req->iov_idx < req->data_iovs - 1)) {
				767	iovec->offset += iov_offset;
				768	iovec = &req->iovs[++req->iov_idx];
				769	iov_offset = 0;
				770	}
				771
				772	*queued_ptr = queued;
				773	*data_sent_ptr = data_sent;
				774	*iov_offset_ptr = iov_offset;
				775	return ret;
				776	}
				777
				778	static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
				779	{
				780	int ret = 0;
				781	u16 count;
				782	unsigned npkts = 0;
				783	struct user_sdma_txreq *tx = NULL;
				784	struct hfi1_user_sdma_pkt_q *pq = NULL;
				785	struct user_sdma_iovec *iovec = NULL;
				786
				787	if (!req->pq)
				788	return -EINVAL;
				789
				790	pq = req->pq;
				791
				792	/* If tx completion has reported an error, we are done. */
				793	if (READ_ONCE(req->has_error))
				794	return -EFAULT;
				795
				796	/*
				797	* Check if we might have sent the entire request already
				798	*/
				799	if (unlikely(req->seqnum == req->info.npkts)) {
				800	if (!list_empty(&req->txps))
				801	goto dosend;
				802	return ret;
				803	}
				804
				805	if (!maxpkts \|\| maxpkts > req->info.npkts - req->seqnum)
				806	maxpkts = req->info.npkts - req->seqnum;
				807
				808	while (npkts < maxpkts) {
				809	u32 datalen = 0, queued = 0, data_sent = 0;
				810	u64 iov_offset = 0;
				811
				812	/*
				813	* Check whether any of the completions have come back
				814	* with errors. If so, we are not going to process any
				815	* more packets from this request.
				816	*/
				817	if (READ_ONCE(req->has_error))
				818	return -EFAULT;
				819
				820	tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
				821	if (!tx)
				822	return -ENOMEM;
				823
				824	tx->flags = 0;
				825	tx->req = req;
				826	INIT_LIST_HEAD(&tx->list);
				827
				828	/*
				829	* For the last packet set the ACK request
				830	* and disable header suppression.
				831	*/
				832	if (req->seqnum == req->info.npkts - 1)
				833	tx->flags \|= (TXREQ_FLAGS_REQ_ACK \|
				834	TXREQ_FLAGS_REQ_DISABLE_SH);
				835
				836	/*
				837	* Calculate the payload size - this is min of the fragment
				838	* (MTU) size or the remaining bytes in the request but only
				839	* if we have payload data.
				840	*/
				841	if (req->data_len) {
				842	iovec = &req->iovs[req->iov_idx];
				843	if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				844	if (++req->iov_idx == req->data_iovs) {
				845	ret = -EFAULT;
				846	goto free_tx;
				847	}
				848	iovec = &req->iovs[req->iov_idx];
				849	WARN_ON(iovec->offset);
				850	}
				851
				852	datalen = compute_data_length(req, tx);
				853
				854	/*
				855	* Disable header suppression for the payload <= 8DWS.
				856	* If there is an uncorrectable error in the receive
				857	* data FIFO when the received payload size is less than
				858	* or equal to 8DWS then the RxDmaDataFifoRdUncErr is
				859	* not reported.There is set RHF.EccErr if the header
				860	* is not suppressed.
				861	*/
				862	if (!datalen) {
				863	SDMA_DBG(req,
				864	"Request has data but pkt len is 0");
				865	ret = -EFAULT;
				866	goto free_tx;
				867	} else if (datalen <= 32) {
				868	tx->flags \|= TXREQ_FLAGS_REQ_DISABLE_SH;
				869	}
				870	}
				871
				872	if (req->ahg_idx >= 0) {
				873	if (!req->seqnum) {
				874	ret = user_sdma_txadd_ahg(req, tx, datalen);
				875	if (ret)
				876	goto free_tx;
				877	} else {
				878	int changes;
				879
				880	changes = set_txreq_header_ahg(req, tx,
				881	datalen);
				882	if (changes < 0) {
				883	ret = changes;
				884	goto free_tx;
				885	}
				886	}
				887	} else {
				888	ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
				889	datalen, user_sdma_txreq_cb);
				890	if (ret)
				891	goto free_tx;
				892	/*
				893	* Modify the header for this packet. This only needs
				894	* to be done if we are not going to use AHG. Otherwise,
				895	* the HW will do it based on the changes we gave it
				896	* during sdma_txinit_ahg().
				897	*/
				898	ret = set_txreq_header(req, tx, datalen);
				899	if (ret)
				900	goto free_txreq;
				901	}
				902
				903	/*
				904	* If the request contains any data vectors, add up to
				905	* fragsize bytes to the descriptor.
				906	*/
				907	while (queued < datalen &&
				908	(req->sent + data_sent) < req->data_len) {
				909	ret = user_sdma_txadd(req, tx, iovec, datalen,
				910	&queued, &data_sent, &iov_offset);
				911	if (ret)
				912	goto free_txreq;
				913	}
				914	/*
				915	* The txreq was submitted successfully so we can update
				916	* the counters.
				917	*/
				918	req->koffset += datalen;
				919	if (req_opcode(req->info.ctrl) == EXPECTED)
				920	req->tidoffset += datalen;
				921	req->sent += data_sent;
				922	if (req->data_len)
				923	iovec->offset += iov_offset;
				924	list_add_tail(&tx->txreq.list, &req->txps);
				925	/*
				926	* It is important to increment this here as it is used to
				927	* generate the BTH.PSN and, therefore, can't be bulk-updated
				928	* outside of the loop.
				929	*/
				930	tx->seqnum = req->seqnum++;
				931	npkts++;
				932	}
				933	dosend:
				934	ret = sdma_send_txlist(req->sde,
				935	iowait_get_ib_work(&pq->busy),
				936	&req->txps, &count);
				937	req->seqsubmitted += count;
				938	if (req->seqsubmitted == req->info.npkts) {
				939	/*
				940	* The txreq has already been submitted to the HW queue
				941	* so we can free the AHG entry now. Corruption will not
				942	* happen due to the sequential manner in which
				943	* descriptors are processed.
				944	*/
				945	if (req->ahg_idx >= 0)
				946	sdma_ahg_free(req->sde, req->ahg_idx);
				947	}
				948	return ret;
				949
				950	free_txreq:
				951	sdma_txclean(pq->dd, &tx->txreq);
				952	free_tx:
				953	kmem_cache_free(pq->txreq_cache, tx);
				954	return ret;
				955	}
				956
				957	static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
				958	{
				959	struct evict_data evict_data;
				960
				961	evict_data.cleared = 0;
				962	evict_data.target = npages;
				963	hfi1_mmu_rb_evict(pq->handler, &evict_data);
				964	return evict_data.cleared;
				965	}
				966
				967	static int pin_sdma_pages(struct user_sdma_request *req,
				968	struct user_sdma_iovec *iovec,
				969	struct sdma_mmu_node *node,
				970	int npages)
				971	{
				972	int pinned, cleared;
				973	struct page **pages;
				974	struct hfi1_user_sdma_pkt_q *pq = req->pq;
				975
				976	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
				977	if (!pages)
				978	return -ENOMEM;
				979	memcpy(pages, node->pages, node->npages * sizeof(*pages));
				980
				981	npages -= node->npages;
				982	retry:
				983	if (!hfi1_can_pin_pages(pq->dd, pq->mm,
				984	atomic_read(&pq->n_locked), npages)) {
				985	cleared = sdma_cache_evict(pq, npages);
				986	if (cleared >= npages)
				987	goto retry;
				988	}
				989	pinned = hfi1_acquire_user_pages(pq->mm,
				990	((unsigned long)iovec->iov.iov_base +
				991	(node->npages * PAGE_SIZE)), npages, 0,
				992	pages + node->npages);
				993	if (pinned < 0) {
				994	kfree(pages);
				995	return pinned;
				996	}
				997	if (pinned != npages) {
				998	unpin_vector_pages(pq->mm, pages, node->npages, pinned);
				999	return -EFAULT;
				1000	}
				1001	kfree(node->pages);
				1002	node->rb.len = iovec->iov.iov_len;
				1003	node->pages = pages;
				1004	atomic_add(pinned, &pq->n_locked);
				1005	return pinned;
				1006	}
				1007
				1008	static void unpin_sdma_pages(struct sdma_mmu_node *node)
				1009	{
				1010	if (node->npages) {
				1011	unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
				1012	atomic_sub(node->npages, &node->pq->n_locked);
				1013	}
				1014	}
				1015
				1016	static int pin_vector_pages(struct user_sdma_request *req,
				1017	struct user_sdma_iovec *iovec)
				1018	{
				1019	int ret = 0, pinned, npages;
				1020	struct hfi1_user_sdma_pkt_q *pq = req->pq;
				1021	struct sdma_mmu_node *node = NULL;
				1022	struct mmu_rb_node *rb_node;
				1023	struct iovec *iov;
				1024	bool extracted;
				1025
				1026	extracted =
				1027	hfi1_mmu_rb_remove_unless_exact(pq->handler,
				1028	(unsigned long)
				1029	iovec->iov.iov_base,
				1030	iovec->iov.iov_len, &rb_node);
				1031	if (rb_node) {
				1032	node = container_of(rb_node, struct sdma_mmu_node, rb);
				1033	if (!extracted) {
				1034	atomic_inc(&node->refcount);
				1035	iovec->pages = node->pages;
				1036	iovec->npages = node->npages;
				1037	iovec->node = node;
				1038	return 0;
				1039	}
				1040	}
				1041
				1042	if (!node) {
				1043	node = kzalloc(sizeof(*node), GFP_KERNEL);
				1044	if (!node)
				1045	return -ENOMEM;
				1046
				1047	node->rb.addr = (unsigned long)iovec->iov.iov_base;
				1048	node->pq = pq;
				1049	atomic_set(&node->refcount, 0);
				1050	}
				1051
				1052	iov = &iovec->iov;
				1053	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
				1054	if (node->npages < npages) {
				1055	pinned = pin_sdma_pages(req, iovec, node, npages);
				1056	if (pinned < 0) {
				1057	ret = pinned;
				1058	goto bail;
				1059	}
				1060	node->npages += pinned;
				1061	npages = node->npages;
				1062	}
				1063	iovec->pages = node->pages;
				1064	iovec->npages = npages;
				1065	iovec->node = node;
				1066
				1067	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
				1068	if (ret) {
				1069	iovec->node = NULL;
				1070	goto bail;
				1071	}
				1072	return 0;
				1073	bail:
				1074	unpin_sdma_pages(node);
				1075	kfree(node);
				1076	return ret;
				1077	}
				1078
				1079	static void unpin_vector_pages(struct mm_struct mm, struct page *pages,
				1080	unsigned start, unsigned npages)
				1081	{
				1082	hfi1_release_user_pages(mm, pages + start, npages, false);
				1083	kfree(pages);
				1084	}
				1085
				1086	static int check_header_template(struct user_sdma_request *req,
				1087	struct hfi1_pkt_header *hdr, u32 lrhlen,
				1088	u32 datalen)
				1089	{
				1090	/*
				1091	* Perform safety checks for any type of packet:
				1092	* - transfer size is multiple of 64bytes
				1093	* - packet length is multiple of 4 bytes
				1094	* - packet length is not larger than MTU size
				1095	*
				1096	* These checks are only done for the first packet of the
				1097	* transfer since the header is "given" to us by user space.
				1098	* For the remainder of the packets we compute the values.
				1099	*/
				1100	if (req->info.fragsize % PIO_BLOCK_SIZE \|\| lrhlen & 0x3 \|\|
				1101	lrhlen > get_lrh_len(*hdr, req->info.fragsize))
				1102	return -EINVAL;
				1103
				1104	if (req_opcode(req->info.ctrl) == EXPECTED) {
				1105	/*
				1106	* The header is checked only on the first packet. Furthermore,
				1107	* we ensure that at least one TID entry is copied when the
				1108	* request is submitted. Therefore, we don't have to verify that
				1109	* tididx points to something sane.
				1110	*/
				1111	u32 tidval = req->tids[req->tididx],
				1112	tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
				1113	tididx = EXP_TID_GET(tidval, IDX),
				1114	tidctrl = EXP_TID_GET(tidval, CTRL),
				1115	tidoff;
				1116	__le32 kval = hdr->kdeth.ver_tid_offset;
				1117
				1118	tidoff = KDETH_GET(kval, OFFSET) *
				1119	(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
				1120	KDETH_OM_LARGE : KDETH_OM_SMALL);
				1121	/*
				1122	* Expected receive packets have the following
				1123	* additional checks:
				1124	* - offset is not larger than the TID size
				1125	* - TIDCtrl values match between header and TID array
				1126	* - TID indexes match between header and TID array
				1127	*/
				1128	if ((tidoff + datalen > tidlen) \|\|
				1129	KDETH_GET(kval, TIDCTRL) != tidctrl \|\|
				1130	KDETH_GET(kval, TID) != tididx)
				1131	return -EINVAL;
				1132	}
				1133	return 0;
				1134	}
				1135
				1136	/*
				1137	* Correctly set the BTH.PSN field based on type of
				1138	* transfer - eager packets can just increment the PSN but
				1139	* expected packets encode generation and sequence in the
				1140	* BTH.PSN field so just incrementing will result in errors.
				1141	*/
				1142	static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
				1143	{
				1144	u32 val = be32_to_cpu(bthpsn),
				1145	mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
				1146	0xffffffull),
				1147	psn = val & mask;
				1148	if (expct)
				1149	psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) \|
				1150	((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
				1151	else
				1152	psn = psn + frags;
				1153	return psn & mask;
				1154	}
				1155
				1156	static int set_txreq_header(struct user_sdma_request *req,
				1157	struct user_sdma_txreq *tx, u32 datalen)
				1158	{
				1159	struct hfi1_user_sdma_pkt_q *pq = req->pq;
				1160	struct hfi1_pkt_header *hdr = &tx->hdr;
				1161	u8 omfactor; /* KDETH.OM */
				1162	u16 pbclen;
				1163	int ret;
				1164	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
				1165
				1166	/* Copy the header template to the request before modification */
				1167	memcpy(hdr, &req->hdr, sizeof(*hdr));
				1168
				1169	/*
				1170	* Check if the PBC and LRH length are mismatched. If so
				1171	* adjust both in the header.
				1172	*/
				1173	pbclen = le16_to_cpu(hdr->pbc[0]);
				1174	if (PBC2LRH(pbclen) != lrhlen) {
				1175	pbclen = (pbclen & 0xf000) \| LRH2PBC(lrhlen);
				1176	hdr->pbc[0] = cpu_to_le16(pbclen);
				1177	hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
				1178	/*
				1179	* Third packet
				1180	* This is the first packet in the sequence that has
				1181	* a "static" size that can be used for the rest of
				1182	* the packets (besides the last one).
				1183	*/
				1184	if (unlikely(req->seqnum == 2)) {
				1185	/*
				1186	* From this point on the lengths in both the
				1187	* PBC and LRH are the same until the last
				1188	* packet.
				1189	* Adjust the template so we don't have to update
				1190	* every packet
				1191	*/
				1192	req->hdr.pbc[0] = hdr->pbc[0];
				1193	req->hdr.lrh[2] = hdr->lrh[2];
				1194	}
				1195	}
				1196	/*
				1197	* We only have to modify the header if this is not the
				1198	* first packet in the request. Otherwise, we use the
				1199	* header given to us.
				1200	*/
				1201	if (unlikely(!req->seqnum)) {
				1202	ret = check_header_template(req, hdr, lrhlen, datalen);
				1203	if (ret)
				1204	return ret;
				1205	goto done;
				1206	}
				1207
				1208	hdr->bth[2] = cpu_to_be32(
				1209	set_pkt_bth_psn(hdr->bth[2],
				1210	(req_opcode(req->info.ctrl) == EXPECTED),
				1211	req->seqnum));
				1212
				1213	/* Set ACK request on last packet */
				1214	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
				1215	hdr->bth[2] \|= cpu_to_be32(1UL << 31);
				1216
				1217	/* Set the new offset */
				1218	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
				1219	/* Expected packets have to fill in the new TID information */
				1220	if (req_opcode(req->info.ctrl) == EXPECTED) {
				1221	tidval = req->tids[req->tididx];
				1222	/*
				1223	* If the offset puts us at the end of the current TID,
				1224	* advance everything.
				1225	*/
				1226	if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
				1227	PAGE_SIZE)) {
				1228	req->tidoffset = 0;
				1229	/*
				1230	* Since we don't copy all the TIDs, all at once,
				1231	* we have to check again.
				1232	*/
				1233	if (++req->tididx > req->n_tids - 1 \|\|
				1234	!req->tids[req->tididx]) {
				1235	return -EINVAL;
				1236	}
				1237	tidval = req->tids[req->tididx];
				1238	}
				1239	omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
				1240	KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
				1241	KDETH_OM_SMALL_SHIFT;
				1242	/* Set KDETH.TIDCtrl based on value for this TID. */
				1243	KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
				1244	EXP_TID_GET(tidval, CTRL));
				1245	/* Set KDETH.TID based on value for this TID */
				1246	KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
				1247	EXP_TID_GET(tidval, IDX));
				1248	/* Clear KDETH.SH when DISABLE_SH flag is set */
				1249	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
				1250	KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
				1251	/*
				1252	* Set the KDETH.OFFSET and KDETH.OM based on size of
				1253	* transfer.
				1254	*/
				1255	trace_hfi1_sdma_user_tid_info(
				1256	pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
				1257	req->tidoffset, req->tidoffset >> omfactor,
				1258	omfactor != KDETH_OM_SMALL_SHIFT);
				1259	KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
				1260	req->tidoffset >> omfactor);
				1261	KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
				1262	omfactor != KDETH_OM_SMALL_SHIFT);
				1263	}
				1264	done:
				1265	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				1266	req->info.comp_idx, hdr, tidval);
				1267	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
				1268	}
				1269
				1270	static int set_txreq_header_ahg(struct user_sdma_request *req,
				1271	struct user_sdma_txreq *tx, u32 datalen)
				1272	{
				1273	u32 ahg[AHG_KDETH_ARRAY_SIZE];
				1274	int idx = 0;
				1275	u8 omfactor; /* KDETH.OM */
				1276	struct hfi1_user_sdma_pkt_q *pq = req->pq;
				1277	struct hfi1_pkt_header *hdr = &req->hdr;
				1278	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
				1279	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
				1280	size_t array_size = ARRAY_SIZE(ahg);
				1281
				1282	if (PBC2LRH(pbclen) != lrhlen) {
				1283	/* PBC.PbcLengthDWs */
				1284	idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				1285	(__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
				1286	if (idx < 0)
				1287	return idx;
				1288	/* LRH.PktLen (we need the full 16 bits due to byte swap) */
				1289	idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				1290	(__force u16)cpu_to_be16(lrhlen >> 2));
				1291	if (idx < 0)
				1292	return idx;
				1293	}
				1294
				1295	/*
				1296	* Do the common updates
				1297	*/
				1298	/* BTH.PSN and BTH.A */
				1299	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
				1300	(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
				1301	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
				1302	val32 \|= 1UL << 31;
				1303	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
				1304	(__force u16)cpu_to_be16(val32 >> 16));
				1305	if (idx < 0)
				1306	return idx;
				1307	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
				1308	(__force u16)cpu_to_be16(val32 & 0xffff));
				1309	if (idx < 0)
				1310	return idx;
				1311	/* KDETH.Offset */
				1312	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
				1313	(__force u16)cpu_to_le16(req->koffset & 0xffff));
				1314	if (idx < 0)
				1315	return idx;
				1316	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
				1317	(__force u16)cpu_to_le16(req->koffset >> 16));
				1318	if (idx < 0)
				1319	return idx;
				1320	if (req_opcode(req->info.ctrl) == EXPECTED) {
				1321	__le16 val;
				1322
				1323	tidval = req->tids[req->tididx];
				1324
				1325	/*
				1326	* If the offset puts us at the end of the current TID,
				1327	* advance everything.
				1328	*/
				1329	if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
				1330	PAGE_SIZE)) {
				1331	req->tidoffset = 0;
				1332	/*
				1333	* Since we don't copy all the TIDs, all at once,
				1334	* we have to check again.
				1335	*/
				1336	if (++req->tididx > req->n_tids - 1 \|\|
				1337	!req->tids[req->tididx])
				1338	return -EINVAL;
				1339	tidval = req->tids[req->tididx];
				1340	}
				1341	omfactor = ((EXP_TID_GET(tidval, LEN) *
				1342	PAGE_SIZE) >=
				1343	KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
				1344	KDETH_OM_SMALL_SHIFT;
				1345	/* KDETH.OM and KDETH.OFFSET (TID) */
				1346	idx = ahg_header_set(
				1347	ahg, idx, array_size, 7, 0, 16,
				1348	((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 \|
				1349	((req->tidoffset >> omfactor)
				1350	& 0x7fff)));
				1351	if (idx < 0)
				1352	return idx;
				1353	/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
				1354	val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) \|
				1355	(EXP_TID_GET(tidval, IDX) & 0x3ff));
				1356
				1357	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
				1358	val \|= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
				1359	INTR) <<
				1360	AHG_KDETH_INTR_SHIFT));
				1361	} else {
				1362	val \|= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
				1363	cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
				1364	cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
				1365	INTR) <<
				1366	AHG_KDETH_INTR_SHIFT));
				1367	}
				1368
				1369	idx = ahg_header_set(ahg, idx, array_size,
				1370	7, 16, 14, (__force u16)val);
				1371	if (idx < 0)
				1372	return idx;
				1373	}
				1374
				1375	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
				1376	req->info.comp_idx, req->sde->this_idx,
				1377	req->ahg_idx, ahg, idx, tidval);
				1378	sdma_txinit_ahg(&tx->txreq,
				1379	SDMA_TXREQ_F_USE_AHG,
				1380	datalen, req->ahg_idx, idx,
				1381	ahg, sizeof(req->hdr),
				1382	user_sdma_txreq_cb);
				1383
				1384	return idx;
				1385	}
				1386
				1387	/**
				1388	* user_sdma_txreq_cb() - SDMA tx request completion callback.
				1389	* @txreq: valid sdma tx request
				1390	* @status: success/failure of request
				1391	*
				1392	* Called when the SDMA progress state machine gets notification that
				1393	* the SDMA descriptors for this tx request have been processed by the
				1394	* DMA engine. Called in interrupt context.
				1395	* Only do work on completed sequences.
				1396	*/
				1397	static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
				1398	{
				1399	struct user_sdma_txreq *tx =
				1400	container_of(txreq, struct user_sdma_txreq, txreq);
				1401	struct user_sdma_request *req;
				1402	struct hfi1_user_sdma_pkt_q *pq;
				1403	struct hfi1_user_sdma_comp_q *cq;
				1404	enum hfi1_sdma_comp_state state = COMPLETE;
				1405
				1406	if (!tx->req)
				1407	return;
				1408
				1409	req = tx->req;
				1410	pq = req->pq;
				1411	cq = req->cq;
				1412
				1413	if (status != SDMA_TXREQ_S_OK) {
				1414	SDMA_DBG(req, "SDMA completion with error %d",
				1415	status);
				1416	WRITE_ONCE(req->has_error, 1);
				1417	state = ERROR;
				1418	}
				1419
				1420	req->seqcomp = tx->seqnum;
				1421	kmem_cache_free(pq->txreq_cache, tx);
				1422
				1423	/* sequence isn't complete? We are done */
				1424	if (req->seqcomp != req->info.npkts - 1)
				1425	return;
				1426
				1427	user_sdma_free_request(req, false);
				1428	set_comp_state(pq, cq, req->info.comp_idx, state, status);
				1429	pq_update(pq);
				1430	}
				1431
				1432	static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
				1433	{
				1434	if (atomic_dec_and_test(&pq->n_reqs))
				1435	wake_up(&pq->wait);
				1436	}
				1437
				1438	static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
				1439	{
				1440	int i;
				1441
				1442	if (!list_empty(&req->txps)) {
				1443	struct sdma_txreq t, p;
				1444
				1445	list_for_each_entry_safe(t, p, &req->txps, list) {
				1446	struct user_sdma_txreq *tx =
				1447	container_of(t, struct user_sdma_txreq, txreq);
				1448	list_del_init(&t->list);
				1449	sdma_txclean(req->pq->dd, t);
				1450	kmem_cache_free(req->pq->txreq_cache, tx);
				1451	}
				1452	}
				1453
				1454	for (i = 0; i < req->data_iovs; i++) {
				1455	struct sdma_mmu_node *node = req->iovs[i].node;
				1456
				1457	if (!node)
				1458	continue;
				1459
				1460	req->iovs[i].node = NULL;
				1461
				1462	if (unpin)
				1463	hfi1_mmu_rb_remove(req->pq->handler,
				1464	&node->rb);
				1465	else
				1466	atomic_dec(&node->refcount);
				1467	}
				1468
				1469	kfree(req->tids);
				1470	clear_bit(req->info.comp_idx, req->pq->req_in_use);
				1471	}
				1472
				1473	static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				1474	struct hfi1_user_sdma_comp_q *cq,
				1475	u16 idx, enum hfi1_sdma_comp_state state,
				1476	int ret)
				1477	{
				1478	if (state == ERROR)
				1479	cq->comps[idx].errcode = -ret;
				1480	smp_wmb(); /* make sure errcode is visible first */
				1481	cq->comps[idx].status = state;
				1482	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
				1483	idx, state, ret);
				1484	}
				1485
				1486	static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
				1487	unsigned long len)
				1488	{
				1489	return (bool)(node->addr == addr);
				1490	}
				1491
				1492	static int sdma_rb_insert(void arg, struct mmu_rb_node mnode)
				1493	{
				1494	struct sdma_mmu_node *node =
				1495	container_of(mnode, struct sdma_mmu_node, rb);
				1496
				1497	atomic_inc(&node->refcount);
				1498	return 0;
				1499	}
				1500
				1501	/*
				1502	* Return 1 to remove the node from the rb tree and call the remove op.
				1503	*
				1504	* Called with the rb tree lock held.
				1505	*/
				1506	static int sdma_rb_evict(void arg, struct mmu_rb_node mnode,
				1507	void evict_arg, bool stop)
				1508	{
				1509	struct sdma_mmu_node *node =
				1510	container_of(mnode, struct sdma_mmu_node, rb);
				1511	struct evict_data *evict_data = evict_arg;
				1512
				1513	/* is this node still being used? */
				1514	if (atomic_read(&node->refcount))
				1515	return 0; /* keep this node */
				1516
				1517	/* this node will be evicted, add its pages to our count */
				1518	evict_data->cleared += node->npages;
				1519
				1520	/* have enough pages been cleared? */
				1521	if (evict_data->cleared >= evict_data->target)
				1522	*stop = true;
				1523
				1524	return 1; /* remove this node */
				1525	}
				1526
				1527	static void sdma_rb_remove(void arg, struct mmu_rb_node mnode)
				1528	{
				1529	struct sdma_mmu_node *node =
				1530	container_of(mnode, struct sdma_mmu_node, rb);
				1531
				1532	unpin_sdma_pages(node);
				1533	kfree(node);
				1534	}
				1535
				1536	static int sdma_rb_invalidate(void arg, struct mmu_rb_node mnode)
				1537	{
				1538	struct sdma_mmu_node *node =
				1539	container_of(mnode, struct sdma_mmu_node, rb);
				1540
				1541	if (!atomic_read(&node->refcount))
				1542	return 1;
				1543	return 0;
				1544	}