Blame - marvell/linux/drivers/infiniband/hw/hfi1/affinity.c - T108

blob: 832b878fa67eba1c8692dc3421234291d9ebac13 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	/*
				2	* Copyright(c) 2015 - 2018 Intel Corporation.
				3	*
				4	* This file is provided under a dual BSD/GPLv2 license. When using or
				5	* redistributing this file, you may do so under either license.
				6	*
				7	* GPL LICENSE SUMMARY
				8	*
				9	* This program is free software; you can redistribute it and/or modify
				10	* it under the terms of version 2 of the GNU General Public License as
				11	* published by the Free Software Foundation.
				12	*
				13	* This program is distributed in the hope that it will be useful, but
				14	* WITHOUT ANY WARRANTY; without even the implied warranty of
				15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				16	* General Public License for more details.
				17	*
				18	* BSD LICENSE
				19	*
				20	* Redistribution and use in source and binary forms, with or without
				21	* modification, are permitted provided that the following conditions
				22	* are met:
				23	*
				24	* - Redistributions of source code must retain the above copyright
				25	* notice, this list of conditions and the following disclaimer.
				26	* - Redistributions in binary form must reproduce the above copyright
				27	* notice, this list of conditions and the following disclaimer in
				28	* the documentation and/or other materials provided with the
				29	* distribution.
				30	* - Neither the name of Intel Corporation nor the names of its
				31	* contributors may be used to endorse or promote products derived
				32	* from this software without specific prior written permission.
				33	*
				34	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				35	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				36	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				37	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				38	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				39	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				40	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				41	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				42	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				43	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				44	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				45	*
				46	*/
				47	#include <linux/topology.h>
				48	#include <linux/cpumask.h>
				49	#include <linux/module.h>
				50	#include <linux/interrupt.h>
				51	#include <linux/numa.h>
				52
				53	#include "hfi.h"
				54	#include "affinity.h"
				55	#include "sdma.h"
				56	#include "trace.h"
				57
				58	struct hfi1_affinity_node_list node_affinity = {
				59	.list = LIST_HEAD_INIT(node_affinity.list),
				60	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
				61	};
				62
				63	/* Name of IRQ types, indexed by enum irq_type */
				64	static const char * const irq_type_names[] = {
				65	"SDMA",
				66	"RCVCTXT",
				67	"GENERAL",
				68	"OTHER",
				69	};
				70
				71	/* Per NUMA node count of HFI devices */
				72	static unsigned int *hfi1_per_node_cntr;
				73
				74	static inline void init_cpu_mask_set(struct cpu_mask_set *set)
				75	{
				76	cpumask_clear(&set->mask);
				77	cpumask_clear(&set->used);
				78	set->gen = 0;
				79	}
				80
				81	/* Increment generation of CPU set if needed */
				82	static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
				83	{
				84	if (cpumask_equal(&set->mask, &set->used)) {
				85	/*
				86	* We've used up all the CPUs, bump up the generation
				87	* and reset the 'used' map
				88	*/
				89	set->gen++;
				90	cpumask_clear(&set->used);
				91	}
				92	}
				93
				94	static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
				95	{
				96	if (cpumask_empty(&set->used) && set->gen) {
				97	set->gen--;
				98	cpumask_copy(&set->used, &set->mask);
				99	}
				100	}
				101
				102	/* Get the first CPU from the list of unused CPUs in a CPU set data structure */
				103	static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
				104	{
				105	int cpu;
				106
				107	if (!diff \|\| !set)
				108	return -EINVAL;
				109
				110	_cpu_mask_set_gen_inc(set);
				111
				112	/* Find out CPUs left in CPU mask */
				113	cpumask_andnot(diff, &set->mask, &set->used);
				114
				115	cpu = cpumask_first(diff);
				116	if (cpu >= nr_cpu_ids) /* empty */
				117	cpu = -EINVAL;
				118	else
				119	cpumask_set_cpu(cpu, &set->used);
				120
				121	return cpu;
				122	}
				123
				124	static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
				125	{
				126	if (!set)
				127	return;
				128
				129	cpumask_clear_cpu(cpu, &set->used);
				130	_cpu_mask_set_gen_dec(set);
				131	}
				132
				133	/* Initialize non-HT cpu cores mask */
				134	void init_real_cpu_mask(void)
				135	{
				136	int possible, curr_cpu, i, ht;
				137
				138	cpumask_clear(&node_affinity.real_cpu_mask);
				139
				140	/* Start with cpu online mask as the real cpu mask */
				141	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
				142
				143	/*
				144	* Remove HT cores from the real cpu mask. Do this in two steps below.
				145	*/
				146	possible = cpumask_weight(&node_affinity.real_cpu_mask);
				147	ht = cpumask_weight(topology_sibling_cpumask(
				148	cpumask_first(&node_affinity.real_cpu_mask)));
				149	/*
				150	* Step 1. Skip over the first N HT siblings and use them as the
				151	* "real" cores. Assumes that HT cores are not enumerated in
				152	* succession (except in the single core case).
				153	*/
				154	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
				155	for (i = 0; i < possible / ht; i++)
				156	curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
				157	/*
				158	* Step 2. Remove the remaining HT siblings. Use cpumask_next() to
				159	* skip any gaps.
				160	*/
				161	for (; i < possible; i++) {
				162	cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
				163	curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
				164	}
				165	}
				166
				167	int node_affinity_init(void)
				168	{
				169	int node;
				170	struct pci_dev *dev = NULL;
				171	const struct pci_device_id *ids = hfi1_pci_tbl;
				172
				173	cpumask_clear(&node_affinity.proc.used);
				174	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
				175
				176	node_affinity.proc.gen = 0;
				177	node_affinity.num_core_siblings =
				178	cpumask_weight(topology_sibling_cpumask(
				179	cpumask_first(&node_affinity.proc.mask)
				180	));
				181	node_affinity.num_possible_nodes = num_possible_nodes();
				182	node_affinity.num_online_nodes = num_online_nodes();
				183	node_affinity.num_online_cpus = num_online_cpus();
				184
				185	/*
				186	* The real cpu mask is part of the affinity struct but it has to be
				187	* initialized early. It is needed to calculate the number of user
				188	* contexts in set_up_context_variables().
				189	*/
				190	init_real_cpu_mask();
				191
				192	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				193	sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
				194	if (!hfi1_per_node_cntr)
				195	return -ENOMEM;
				196
				197	while (ids->vendor) {
				198	dev = NULL;
				199	while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
				200	node = pcibus_to_node(dev->bus);
				201	if (node < 0)
				202	goto out;
				203
				204	hfi1_per_node_cntr[node]++;
				205	}
				206	ids++;
				207	}
				208
				209	return 0;
				210
				211	out:
				212	/*
				213	* Invalid PCI NUMA node information found, note it, and populate
				214	* our database 1:1.
				215	*/
				216	pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
				217	pr_err("HFI: System BIOS may need to be upgraded\n");
				218	for (node = 0; node < node_affinity.num_possible_nodes; node++)
				219	hfi1_per_node_cntr[node] = 1;
				220
				221	pci_dev_put(dev);
				222
				223	return 0;
				224	}
				225
				226	static void node_affinity_destroy(struct hfi1_affinity_node *entry)
				227	{
				228	free_percpu(entry->comp_vect_affinity);
				229	kfree(entry);
				230	}
				231
				232	void node_affinity_destroy_all(void)
				233	{
				234	struct list_head pos, q;
				235	struct hfi1_affinity_node *entry;
				236
				237	mutex_lock(&node_affinity.lock);
				238	list_for_each_safe(pos, q, &node_affinity.list) {
				239	entry = list_entry(pos, struct hfi1_affinity_node,
				240	list);
				241	list_del(pos);
				242	node_affinity_destroy(entry);
				243	}
				244	mutex_unlock(&node_affinity.lock);
				245	kfree(hfi1_per_node_cntr);
				246	}
				247
				248	static struct hfi1_affinity_node *node_affinity_allocate(int node)
				249	{
				250	struct hfi1_affinity_node *entry;
				251
				252	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
				253	if (!entry)
				254	return NULL;
				255	entry->node = node;
				256	entry->comp_vect_affinity = alloc_percpu(u16);
				257	INIT_LIST_HEAD(&entry->list);
				258
				259	return entry;
				260	}
				261
				262	/*
				263	* It appends an entry to the list.
				264	* It must be called with node_affinity.lock held.
				265	*/
				266	static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
				267	{
				268	list_add_tail(&entry->list, &node_affinity.list);
				269	}
				270
				271	/* It must be called with node_affinity.lock held */
				272	static struct hfi1_affinity_node *node_affinity_lookup(int node)
				273	{
				274	struct list_head *pos;
				275	struct hfi1_affinity_node *entry;
				276
				277	list_for_each(pos, &node_affinity.list) {
				278	entry = list_entry(pos, struct hfi1_affinity_node, list);
				279	if (entry->node == node)
				280	return entry;
				281	}
				282
				283	return NULL;
				284	}
				285
				286	static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				287	u16 __percpu *comp_vect_affinity)
				288	{
				289	int curr_cpu;
				290	u16 cntr;
				291	u16 prev_cntr;
				292	int ret_cpu;
				293
				294	if (!possible_cpumask) {
				295	ret_cpu = -EINVAL;
				296	goto fail;
				297	}
				298
				299	if (!comp_vect_affinity) {
				300	ret_cpu = -EINVAL;
				301	goto fail;
				302	}
				303
				304	ret_cpu = cpumask_first(possible_cpumask);
				305	if (ret_cpu >= nr_cpu_ids) {
				306	ret_cpu = -EINVAL;
				307	goto fail;
				308	}
				309
				310	prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
				311	for_each_cpu(curr_cpu, possible_cpumask) {
				312	cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);
				313
				314	if (cntr < prev_cntr) {
				315	ret_cpu = curr_cpu;
				316	prev_cntr = cntr;
				317	}
				318	}
				319
				320	*per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;
				321
				322	fail:
				323	return ret_cpu;
				324	}
				325
				326	static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				327	u16 __percpu *comp_vect_affinity)
				328	{
				329	int curr_cpu;
				330	int max_cpu;
				331	u16 cntr;
				332	u16 prev_cntr;
				333
				334	if (!possible_cpumask)
				335	return -EINVAL;
				336
				337	if (!comp_vect_affinity)
				338	return -EINVAL;
				339
				340	max_cpu = cpumask_first(possible_cpumask);
				341	if (max_cpu >= nr_cpu_ids)
				342	return -EINVAL;
				343
				344	prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
				345	for_each_cpu(curr_cpu, possible_cpumask) {
				346	cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);
				347
				348	if (cntr > prev_cntr) {
				349	max_cpu = curr_cpu;
				350	prev_cntr = cntr;
				351	}
				352	}
				353
				354	*per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;
				355
				356	return max_cpu;
				357	}
				358
				359	/*
				360	* Non-interrupt CPUs are used first, then interrupt CPUs.
				361	* Two already allocated cpu masks must be passed.
				362	*/
				363	static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
				364	struct hfi1_affinity_node *entry,
				365	cpumask_var_t non_intr_cpus,
				366	cpumask_var_t available_cpus)
				367	__must_hold(&node_affinity.lock)
				368	{
				369	int cpu;
				370	struct cpu_mask_set *set = dd->comp_vect;
				371
				372	lockdep_assert_held(&node_affinity.lock);
				373	if (!non_intr_cpus) {
				374	cpu = -1;
				375	goto fail;
				376	}
				377
				378	if (!available_cpus) {
				379	cpu = -1;
				380	goto fail;
				381	}
				382
				383	/* Available CPUs for pinning completion vectors */
				384	_cpu_mask_set_gen_inc(set);
				385	cpumask_andnot(available_cpus, &set->mask, &set->used);
				386
				387	/* Available CPUs without SDMA engine interrupts */
				388	cpumask_andnot(non_intr_cpus, available_cpus,
				389	&entry->def_intr.used);
				390
				391	/* If there are non-interrupt CPUs available, use them first */
				392	if (!cpumask_empty(non_intr_cpus))
				393	cpu = cpumask_first(non_intr_cpus);
				394	else /* Otherwise, use interrupt CPUs */
				395	cpu = cpumask_first(available_cpus);
				396
				397	if (cpu >= nr_cpu_ids) { /* empty */
				398	cpu = -1;
				399	goto fail;
				400	}
				401	cpumask_set_cpu(cpu, &set->used);
				402
				403	fail:
				404	return cpu;
				405	}
				406
				407	static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
				408	{
				409	struct cpu_mask_set *set = dd->comp_vect;
				410
				411	if (cpu < 0)
				412	return;
				413
				414	cpu_mask_set_put(set, cpu);
				415	}
				416
				417	/* _dev_comp_vect_mappings_destroy() is reentrant */
				418	static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
				419	{
				420	int i, cpu;
				421
				422	if (!dd->comp_vect_mappings)
				423	return;
				424
				425	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
				426	cpu = dd->comp_vect_mappings[i];
				427	_dev_comp_vect_cpu_put(dd, cpu);
				428	dd->comp_vect_mappings[i] = -1;
				429	hfi1_cdbg(AFFINITY,
				430	"[%s] Release CPU %d from completion vector %d",
				431	rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
				432	}
				433
				434	kfree(dd->comp_vect_mappings);
				435	dd->comp_vect_mappings = NULL;
				436	}
				437
				438	/*
				439	* This function creates the table for looking up CPUs for completion vectors.
				440	* num_comp_vectors needs to have been initilized before calling this function.
				441	*/
				442	static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
				443	struct hfi1_affinity_node *entry)
				444	__must_hold(&node_affinity.lock)
				445	{
				446	int i, cpu, ret;
				447	cpumask_var_t non_intr_cpus;
				448	cpumask_var_t available_cpus;
				449
				450	lockdep_assert_held(&node_affinity.lock);
				451
				452	if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
				453	return -ENOMEM;
				454
				455	if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
				456	free_cpumask_var(non_intr_cpus);
				457	return -ENOMEM;
				458	}
				459
				460	dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
				461	sizeof(*dd->comp_vect_mappings),
				462	GFP_KERNEL);
				463	if (!dd->comp_vect_mappings) {
				464	ret = -ENOMEM;
				465	goto fail;
				466	}
				467	for (i = 0; i < dd->comp_vect_possible_cpus; i++)
				468	dd->comp_vect_mappings[i] = -1;
				469
				470	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
				471	cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
				472	available_cpus);
				473	if (cpu < 0) {
				474	ret = -EINVAL;
				475	goto fail;
				476	}
				477
				478	dd->comp_vect_mappings[i] = cpu;
				479	hfi1_cdbg(AFFINITY,
				480	"[%s] Completion Vector %d -> CPU %d",
				481	rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
				482	}
				483
				484	free_cpumask_var(available_cpus);
				485	free_cpumask_var(non_intr_cpus);
				486	return 0;
				487
				488	fail:
				489	free_cpumask_var(available_cpus);
				490	free_cpumask_var(non_intr_cpus);
				491	_dev_comp_vect_mappings_destroy(dd);
				492
				493	return ret;
				494	}
				495
				496	int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
				497	{
				498	int ret;
				499	struct hfi1_affinity_node *entry;
				500
				501	mutex_lock(&node_affinity.lock);
				502	entry = node_affinity_lookup(dd->node);
				503	if (!entry) {
				504	ret = -EINVAL;
				505	goto unlock;
				506	}
				507	ret = _dev_comp_vect_mappings_create(dd, entry);
				508	unlock:
				509	mutex_unlock(&node_affinity.lock);
				510
				511	return ret;
				512	}
				513
				514	void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
				515	{
				516	_dev_comp_vect_mappings_destroy(dd);
				517	}
				518
				519	int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
				520	{
				521	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
				522	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
				523
				524	if (!dd->comp_vect_mappings)
				525	return -EINVAL;
				526	if (comp_vect >= dd->comp_vect_possible_cpus)
				527	return -EINVAL;
				528
				529	return dd->comp_vect_mappings[comp_vect];
				530	}
				531
				532	/*
				533	* It assumes dd->comp_vect_possible_cpus is available.
				534	*/
				535	static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
				536	struct hfi1_affinity_node *entry,
				537	bool first_dev_init)
				538	__must_hold(&node_affinity.lock)
				539	{
				540	int i, j, curr_cpu;
				541	int possible_cpus_comp_vect = 0;
				542	struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;
				543
				544	lockdep_assert_held(&node_affinity.lock);
				545	/*
				546	* If there's only one CPU available for completion vectors, then
				547	* there will only be one completion vector available. Othewise,
				548	* the number of completion vector available will be the number of
				549	* available CPUs divide it by the number of devices in the
				550	* local NUMA node.
				551	*/
				552	if (cpumask_weight(&entry->comp_vect_mask) == 1) {
				553	possible_cpus_comp_vect = 1;
				554	dd_dev_warn(dd,
				555	"Number of kernel receive queues is too large for completion vector affinity to be effective\n");
				556	} else {
				557	possible_cpus_comp_vect +=
				558	cpumask_weight(&entry->comp_vect_mask) /
				559	hfi1_per_node_cntr[dd->node];
				560
				561	/*
				562	* If the completion vector CPUs available doesn't divide
				563	* evenly among devices, then the first device device to be
				564	* initialized gets an extra CPU.
				565	*/
				566	if (first_dev_init &&
				567	cpumask_weight(&entry->comp_vect_mask) %
				568	hfi1_per_node_cntr[dd->node] != 0)
				569	possible_cpus_comp_vect++;
				570	}
				571
				572	dd->comp_vect_possible_cpus = possible_cpus_comp_vect;
				573
				574	/* Reserving CPUs for device completion vector */
				575	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
				576	curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
				577	entry->comp_vect_affinity);
				578	if (curr_cpu < 0)
				579	goto fail;
				580
				581	cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
				582	}
				583
				584	hfi1_cdbg(AFFINITY,
				585	"[%s] Completion vector affinity CPU set(s) %*pbl",
				586	rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
				587	cpumask_pr_args(dev_comp_vect_mask));
				588
				589	return 0;
				590
				591	fail:
				592	for (j = 0; j < i; j++)
				593	per_cpu_affinity_put_max(&entry->comp_vect_mask,
				594	entry->comp_vect_affinity);
				595
				596	return curr_cpu;
				597	}
				598
				599	/*
				600	* It assumes dd->comp_vect_possible_cpus is available.
				601	*/
				602	static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
				603	struct hfi1_affinity_node *entry)
				604	__must_hold(&node_affinity.lock)
				605	{
				606	int i, cpu;
				607
				608	lockdep_assert_held(&node_affinity.lock);
				609	if (!dd->comp_vect_possible_cpus)
				610	return;
				611
				612	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
				613	cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
				614	entry->comp_vect_affinity);
				615	/* Clearing CPU in device completion vector cpu mask */
				616	if (cpu >= 0)
				617	cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
				618	}
				619
				620	dd->comp_vect_possible_cpus = 0;
				621	}
				622
				623	/*
				624	* Interrupt affinity.
				625	*
				626	* non-rcv avail gets a default mask that
				627	* starts as possible cpus with threads reset
				628	* and each rcv avail reset.
				629	*
				630	* rcv avail gets node relative 1 wrapping back
				631	* to the node relative 1 as necessary.
				632	*
				633	*/
				634	int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
				635	{
				636	int node = pcibus_to_node(dd->pcidev->bus);
				637	struct hfi1_affinity_node *entry;
				638	const struct cpumask *local_mask;
				639	int curr_cpu, possible, i, ret;
				640	bool new_entry = false;
				641
				642	/*
				643	* If the BIOS does not have the NUMA node information set, select
				644	* NUMA 0 so we get consistent performance.
				645	*/
				646	if (node < 0) {
				647	dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n");
				648	node = 0;
				649	}
				650	dd->node = node;
				651
				652	local_mask = cpumask_of_node(dd->node);
				653	if (cpumask_first(local_mask) >= nr_cpu_ids)
				654	local_mask = topology_core_cpumask(0);
				655
				656	mutex_lock(&node_affinity.lock);
				657	entry = node_affinity_lookup(dd->node);
				658
				659	/*
				660	* If this is the first time this NUMA node's affinity is used,
				661	* create an entry in the global affinity structure and initialize it.
				662	*/
				663	if (!entry) {
				664	entry = node_affinity_allocate(node);
				665	if (!entry) {
				666	dd_dev_err(dd,
				667	"Unable to allocate global affinity node\n");
				668	ret = -ENOMEM;
				669	goto fail;
				670	}
				671	new_entry = true;
				672
				673	init_cpu_mask_set(&entry->def_intr);
				674	init_cpu_mask_set(&entry->rcv_intr);
				675	cpumask_clear(&entry->comp_vect_mask);
				676	cpumask_clear(&entry->general_intr_mask);
				677	/* Use the "real" cpu mask of this node as the default */
				678	cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
				679	local_mask);
				680
				681	/* fill in the receive list */
				682	possible = cpumask_weight(&entry->def_intr.mask);
				683	curr_cpu = cpumask_first(&entry->def_intr.mask);
				684
				685	if (possible == 1) {
				686	/* only one CPU, everyone will use it */
				687	cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
				688	cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
				689	} else {
				690	/*
				691	* The general/control context will be the first CPU in
				692	* the default list, so it is removed from the default
				693	* list and added to the general interrupt list.
				694	*/
				695	cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
				696	cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
				697	curr_cpu = cpumask_next(curr_cpu,
				698	&entry->def_intr.mask);
				699
				700	/*
				701	* Remove the remaining kernel receive queues from
				702	* the default list and add them to the receive list.
				703	*/
				704	for (i = 0;
				705	i < (dd->n_krcv_queues - 1) *
				706	hfi1_per_node_cntr[dd->node];
				707	i++) {
				708	cpumask_clear_cpu(curr_cpu,
				709	&entry->def_intr.mask);
				710	cpumask_set_cpu(curr_cpu,
				711	&entry->rcv_intr.mask);
				712	curr_cpu = cpumask_next(curr_cpu,
				713	&entry->def_intr.mask);
				714	if (curr_cpu >= nr_cpu_ids)
				715	break;
				716	}
				717
				718	/*
				719	* If there ends up being 0 CPU cores leftover for SDMA
				720	* engines, use the same CPU cores as general/control
				721	* context.
				722	*/
				723	if (cpumask_weight(&entry->def_intr.mask) == 0)
				724	cpumask_copy(&entry->def_intr.mask,
				725	&entry->general_intr_mask);
				726	}
				727
				728	/* Determine completion vector CPUs for the entire node */
				729	cpumask_and(&entry->comp_vect_mask,
				730	&node_affinity.real_cpu_mask, local_mask);
				731	cpumask_andnot(&entry->comp_vect_mask,
				732	&entry->comp_vect_mask,
				733	&entry->rcv_intr.mask);
				734	cpumask_andnot(&entry->comp_vect_mask,
				735	&entry->comp_vect_mask,
				736	&entry->general_intr_mask);
				737
				738	/*
				739	* If there ends up being 0 CPU cores leftover for completion
				740	* vectors, use the same CPU core as the general/control
				741	* context.
				742	*/
				743	if (cpumask_weight(&entry->comp_vect_mask) == 0)
				744	cpumask_copy(&entry->comp_vect_mask,
				745	&entry->general_intr_mask);
				746	}
				747
				748	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
				749	if (ret < 0)
				750	goto fail;
				751
				752	if (new_entry)
				753	node_affinity_add_tail(entry);
				754
				755	mutex_unlock(&node_affinity.lock);
				756
				757	return 0;
				758
				759	fail:
				760	if (new_entry)
				761	node_affinity_destroy(entry);
				762	mutex_unlock(&node_affinity.lock);
				763	return ret;
				764	}
				765
				766	void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
				767	{
				768	struct hfi1_affinity_node *entry;
				769
				770	if (dd->node < 0)
				771	return;
				772
				773	mutex_lock(&node_affinity.lock);
				774	entry = node_affinity_lookup(dd->node);
				775	if (!entry)
				776	goto unlock;
				777
				778	/*
				779	* Free device completion vector CPUs to be used by future
				780	* completion vectors
				781	*/
				782	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
				783	unlock:
				784	mutex_unlock(&node_affinity.lock);
				785	dd->node = NUMA_NO_NODE;
				786	}
				787
				788	/*
				789	* Function updates the irq affinity hint for msix after it has been changed
				790	* by the user using the /proc/irq interface. This function only accepts
				791	* one cpu in the mask.
				792	*/
				793	static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
				794	{
				795	struct sdma_engine *sde = msix->arg;
				796	struct hfi1_devdata *dd = sde->dd;
				797	struct hfi1_affinity_node *entry;
				798	struct cpu_mask_set *set;
				799	int i, old_cpu;
				800
				801	if (cpu > num_online_cpus() \|\| cpu == sde->cpu)
				802	return;
				803
				804	mutex_lock(&node_affinity.lock);
				805	entry = node_affinity_lookup(dd->node);
				806	if (!entry)
				807	goto unlock;
				808
				809	old_cpu = sde->cpu;
				810	sde->cpu = cpu;
				811	cpumask_clear(&msix->mask);
				812	cpumask_set_cpu(cpu, &msix->mask);
				813	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
				814	msix->irq, irq_type_names[msix->type],
				815	sde->this_idx, cpu);
				816	irq_set_affinity_hint(msix->irq, &msix->mask);
				817
				818	/*
				819	* Set the new cpu in the hfi1_affinity_node and clean
				820	* the old cpu if it is not used by any other IRQ
				821	*/
				822	set = &entry->def_intr;
				823	cpumask_set_cpu(cpu, &set->mask);
				824	cpumask_set_cpu(cpu, &set->used);
				825	for (i = 0; i < dd->msix_info.max_requested; i++) {
				826	struct hfi1_msix_entry *other_msix;
				827
				828	other_msix = &dd->msix_info.msix_entries[i];
				829	if (other_msix->type != IRQ_SDMA \|\| other_msix == msix)
				830	continue;
				831
				832	if (cpumask_test_cpu(old_cpu, &other_msix->mask))
				833	goto unlock;
				834	}
				835	cpumask_clear_cpu(old_cpu, &set->mask);
				836	cpumask_clear_cpu(old_cpu, &set->used);
				837	unlock:
				838	mutex_unlock(&node_affinity.lock);
				839	}
				840
				841	static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				842	const cpumask_t *mask)
				843	{
				844	int cpu = cpumask_first(mask);
				845	struct hfi1_msix_entry *msix = container_of(notify,
				846	struct hfi1_msix_entry,
				847	notify);
				848
				849	/* Only one CPU configuration supported currently */
				850	hfi1_update_sdma_affinity(msix, cpu);
				851	}
				852
				853	static void hfi1_irq_notifier_release(struct kref *ref)
				854	{
				855	/*
				856	* This is required by affinity notifier. We don't have anything to
				857	* free here.
				858	*/
				859	}
				860
				861	static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
				862	{
				863	struct irq_affinity_notify *notify = &msix->notify;
				864
				865	notify->irq = msix->irq;
				866	notify->notify = hfi1_irq_notifier_notify;
				867	notify->release = hfi1_irq_notifier_release;
				868
				869	if (irq_set_affinity_notifier(notify->irq, notify))
				870	pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
				871	notify->irq);
				872	}
				873
				874	static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
				875	{
				876	struct irq_affinity_notify *notify = &msix->notify;
				877
				878	if (irq_set_affinity_notifier(notify->irq, NULL))
				879	pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
				880	notify->irq);
				881	}
				882
				883	/*
				884	* Function sets the irq affinity for msix.
				885	* It must be called with node_affinity.lock held.
				886	*/
				887	static int get_irq_affinity(struct hfi1_devdata *dd,
				888	struct hfi1_msix_entry *msix)
				889	{
				890	cpumask_var_t diff;
				891	struct hfi1_affinity_node *entry;
				892	struct cpu_mask_set *set = NULL;
				893	struct sdma_engine *sde = NULL;
				894	struct hfi1_ctxtdata *rcd = NULL;
				895	char extra[64];
				896	int cpu = -1;
				897
				898	extra[0] = '\0';
				899	cpumask_clear(&msix->mask);
				900
				901	entry = node_affinity_lookup(dd->node);
				902
				903	switch (msix->type) {
				904	case IRQ_SDMA:
				905	sde = (struct sdma_engine *)msix->arg;
				906	scnprintf(extra, 64, "engine %u", sde->this_idx);
				907	set = &entry->def_intr;
				908	break;
				909	case IRQ_GENERAL:
				910	cpu = cpumask_first(&entry->general_intr_mask);
				911	break;
				912	case IRQ_RCVCTXT:
				913	rcd = (struct hfi1_ctxtdata *)msix->arg;
				914	if (rcd->ctxt == HFI1_CTRL_CTXT)
				915	cpu = cpumask_first(&entry->general_intr_mask);
				916	else
				917	set = &entry->rcv_intr;
				918	scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
				919	break;
				920	default:
				921	dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
				922	return -EINVAL;
				923	}
				924
				925	/*
				926	* The general and control contexts are placed on a particular
				927	* CPU, which is set above. Skip accounting for it. Everything else
				928	* finds its CPU here.
				929	*/
				930	if (cpu == -1 && set) {
				931	if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
				932	return -ENOMEM;
				933
				934	cpu = cpu_mask_set_get_first(set, diff);
				935	if (cpu < 0) {
				936	free_cpumask_var(diff);
				937	dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
				938	return cpu;
				939	}
				940
				941	free_cpumask_var(diff);
				942	}
				943
				944	cpumask_set_cpu(cpu, &msix->mask);
				945	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
				946	msix->irq, irq_type_names[msix->type],
				947	extra, cpu);
				948	irq_set_affinity_hint(msix->irq, &msix->mask);
				949
				950	if (msix->type == IRQ_SDMA) {
				951	sde->cpu = cpu;
				952	hfi1_setup_sdma_notifier(msix);
				953	}
				954
				955	return 0;
				956	}
				957
				958	int hfi1_get_irq_affinity(struct hfi1_devdata dd, struct hfi1_msix_entry msix)
				959	{
				960	int ret;
				961
				962	mutex_lock(&node_affinity.lock);
				963	ret = get_irq_affinity(dd, msix);
				964	mutex_unlock(&node_affinity.lock);
				965	return ret;
				966	}
				967
				968	void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
				969	struct hfi1_msix_entry *msix)
				970	{
				971	struct cpu_mask_set *set = NULL;
				972	struct hfi1_ctxtdata *rcd;
				973	struct hfi1_affinity_node *entry;
				974
				975	mutex_lock(&node_affinity.lock);
				976	entry = node_affinity_lookup(dd->node);
				977
				978	switch (msix->type) {
				979	case IRQ_SDMA:
				980	set = &entry->def_intr;
				981	hfi1_cleanup_sdma_notifier(msix);
				982	break;
				983	case IRQ_GENERAL:
				984	/* Don't do accounting for general contexts */
				985	break;
				986	case IRQ_RCVCTXT:
				987	rcd = (struct hfi1_ctxtdata *)msix->arg;
				988	/* Don't do accounting for control contexts */
				989	if (rcd->ctxt != HFI1_CTRL_CTXT)
				990	set = &entry->rcv_intr;
				991	break;
				992	default:
				993	mutex_unlock(&node_affinity.lock);
				994	return;
				995	}
				996
				997	if (set) {
				998	cpumask_andnot(&set->used, &set->used, &msix->mask);
				999	_cpu_mask_set_gen_dec(set);
				1000	}
				1001
				1002	irq_set_affinity_hint(msix->irq, NULL);
				1003	cpumask_clear(&msix->mask);
				1004	mutex_unlock(&node_affinity.lock);
				1005	}
				1006
				1007	/* This should be called with node_affinity.lock held */
				1008	static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				1009	struct hfi1_affinity_node_list *affinity)
				1010	{
				1011	int possible, curr_cpu, i;
				1012	uint num_cores_per_socket = node_affinity.num_online_cpus /
				1013	affinity->num_core_siblings /
				1014	node_affinity.num_online_nodes;
				1015
				1016	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
				1017	if (affinity->num_core_siblings > 0) {
				1018	/* Removing other siblings not needed for now */
				1019	possible = cpumask_weight(hw_thread_mask);
				1020	curr_cpu = cpumask_first(hw_thread_mask);
				1021	for (i = 0;
				1022	i < num_cores_per_socket * node_affinity.num_online_nodes;
				1023	i++)
				1024	curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
				1025
				1026	for (; i < possible; i++) {
				1027	cpumask_clear_cpu(curr_cpu, hw_thread_mask);
				1028	curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
				1029	}
				1030
				1031	/* Identifying correct HW threads within physical cores */
				1032	cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				1033	num_cores_per_socket *
				1034	node_affinity.num_online_nodes *
				1035	hw_thread_no);
				1036	}
				1037	}
				1038
				1039	int hfi1_get_proc_affinity(int node)
				1040	{
				1041	int cpu = -1, ret, i;
				1042	struct hfi1_affinity_node *entry;
				1043	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
				1044	const struct cpumask *node_mask,
				1045	*proc_mask = current->cpus_ptr;
				1046	struct hfi1_affinity_node_list *affinity = &node_affinity;
				1047	struct cpu_mask_set *set = &affinity->proc;
				1048
				1049	/*
				1050	* check whether process/context affinity has already
				1051	* been set
				1052	*/
				1053	if (current->nr_cpus_allowed == 1) {
				1054	hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
				1055	current->pid, current->comm,
				1056	cpumask_pr_args(proc_mask));
				1057	/*
				1058	* Mark the pre-set CPU as used. This is atomic so we don't
				1059	* need the lock
				1060	*/
				1061	cpu = cpumask_first(proc_mask);
				1062	cpumask_set_cpu(cpu, &set->used);
				1063	goto done;
				1064	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
				1065	hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
				1066	current->pid, current->comm,
				1067	cpumask_pr_args(proc_mask));
				1068	goto done;
				1069	}
				1070
				1071	/*
				1072	* The process does not have a preset CPU affinity so find one to
				1073	* recommend using the following algorithm:
				1074	*
				1075	* For each user process that is opening a context on HFI Y:
				1076	* a) If all cores are filled, reinitialize the bitmask
				1077	* b) Fill real cores first, then HT cores (First set of HT
				1078	* cores on all physical cores, then second set of HT core,
				1079	* and, so on) in the following order:
				1080	*
				1081	* 1. Same NUMA node as HFI Y and not running an IRQ
				1082	* handler
				1083	* 2. Same NUMA node as HFI Y and running an IRQ handler
				1084	* 3. Different NUMA node to HFI Y and not running an IRQ
				1085	* handler
				1086	* 4. Different NUMA node to HFI Y and running an IRQ
				1087	* handler
				1088	* c) Mark core as filled in the bitmask. As user processes are
				1089	* done, clear cores from the bitmask.
				1090	*/
				1091
				1092	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
				1093	if (!ret)
				1094	goto done;
				1095	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
				1096	if (!ret)
				1097	goto free_diff;
				1098	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
				1099	if (!ret)
				1100	goto free_hw_thread_mask;
				1101	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
				1102	if (!ret)
				1103	goto free_available_mask;
				1104
				1105	mutex_lock(&affinity->lock);
				1106	/*
				1107	* If we've used all available HW threads, clear the mask and start
				1108	* overloading.
				1109	*/
				1110	_cpu_mask_set_gen_inc(set);
				1111
				1112	/*
				1113	* If NUMA node has CPUs used by interrupt handlers, include them in the
				1114	* interrupt handler mask.
				1115	*/
				1116	entry = node_affinity_lookup(node);
				1117	if (entry) {
				1118	cpumask_copy(intrs_mask, (entry->def_intr.gen ?
				1119	&entry->def_intr.mask :
				1120	&entry->def_intr.used));
				1121	cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
				1122	&entry->rcv_intr.mask :
				1123	&entry->rcv_intr.used));
				1124	cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
				1125	}
				1126	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
				1127	cpumask_pr_args(intrs_mask));
				1128
				1129	cpumask_copy(hw_thread_mask, &set->mask);
				1130
				1131	/*
				1132	* If HT cores are enabled, identify which HW threads within the
				1133	* physical cores should be used.
				1134	*/
				1135	if (affinity->num_core_siblings > 0) {
				1136	for (i = 0; i < affinity->num_core_siblings; i++) {
				1137	find_hw_thread_mask(i, hw_thread_mask, affinity);
				1138
				1139	/*
				1140	* If there's at least one available core for this HW
				1141	* thread number, stop looking for a core.
				1142	*
				1143	* diff will always be not empty at least once in this
				1144	* loop as the used mask gets reset when
				1145	* (set->mask == set->used) before this loop.
				1146	*/
				1147	cpumask_andnot(diff, hw_thread_mask, &set->used);
				1148	if (!cpumask_empty(diff))
				1149	break;
				1150	}
				1151	}
				1152	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
				1153	cpumask_pr_args(hw_thread_mask));
				1154
				1155	node_mask = cpumask_of_node(node);
				1156	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
				1157	cpumask_pr_args(node_mask));
				1158
				1159	/* Get cpumask of available CPUs on preferred NUMA */
				1160	cpumask_and(available_mask, hw_thread_mask, node_mask);
				1161	cpumask_andnot(available_mask, available_mask, &set->used);
				1162	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
				1163	cpumask_pr_args(available_mask));
				1164
				1165	/*
				1166	* At first, we don't want to place processes on the same
				1167	* CPUs as interrupt handlers. Then, CPUs running interrupt
				1168	* handlers are used.
				1169	*
				1170	* 1) If diff is not empty, then there are CPUs not running
				1171	* non-interrupt handlers available, so diff gets copied
				1172	* over to available_mask.
				1173	* 2) If diff is empty, then all CPUs not running interrupt
				1174	* handlers are taken, so available_mask contains all
				1175	* available CPUs running interrupt handlers.
				1176	* 3) If available_mask is empty, then all CPUs on the
				1177	* preferred NUMA node are taken, so other NUMA nodes are
				1178	* used for process assignments using the same method as
				1179	* the preferred NUMA node.
				1180	*/
				1181	cpumask_andnot(diff, available_mask, intrs_mask);
				1182	if (!cpumask_empty(diff))
				1183	cpumask_copy(available_mask, diff);
				1184
				1185	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
				1186	if (cpumask_empty(available_mask)) {
				1187	cpumask_andnot(available_mask, hw_thread_mask, &set->used);
				1188	/* Excluding preferred NUMA cores */
				1189	cpumask_andnot(available_mask, available_mask, node_mask);
				1190	hfi1_cdbg(PROC,
				1191	"Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
				1192	cpumask_pr_args(available_mask));
				1193
				1194	/*
				1195	* At first, we don't want to place processes on the same
				1196	* CPUs as interrupt handlers.
				1197	*/
				1198	cpumask_andnot(diff, available_mask, intrs_mask);
				1199	if (!cpumask_empty(diff))
				1200	cpumask_copy(available_mask, diff);
				1201	}
				1202	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
				1203	cpumask_pr_args(available_mask));
				1204
				1205	cpu = cpumask_first(available_mask);
				1206	if (cpu >= nr_cpu_ids) /* empty */
				1207	cpu = -1;
				1208	else
				1209	cpumask_set_cpu(cpu, &set->used);
				1210
				1211	mutex_unlock(&affinity->lock);
				1212	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);
				1213
				1214	free_cpumask_var(intrs_mask);
				1215	free_available_mask:
				1216	free_cpumask_var(available_mask);
				1217	free_hw_thread_mask:
				1218	free_cpumask_var(hw_thread_mask);
				1219	free_diff:
				1220	free_cpumask_var(diff);
				1221	done:
				1222	return cpu;
				1223	}
				1224
				1225	void hfi1_put_proc_affinity(int cpu)
				1226	{
				1227	struct hfi1_affinity_node_list *affinity = &node_affinity;
				1228	struct cpu_mask_set *set = &affinity->proc;
				1229
				1230	if (cpu < 0)
				1231	return;
				1232
				1233	mutex_lock(&affinity->lock);
				1234	cpu_mask_set_put(set, cpu);
				1235	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
				1236	mutex_unlock(&affinity->lock);
				1237	}