Blame - marvell/linux/kernel/sched/topology.c - T108

blob: c79e693ee2b28a09b36cc6cd566d956b9dde5611 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Scheduler topology setup/handling methods
				4	*/
				5	#include "sched.h"
				6
				7	DEFINE_MUTEX(sched_domains_mutex);
				8
				9	/*
				 * Scratch cpumasks used while printing and building domains in this
				 * file (group coverage masks and the overlap balance mask).
				 * Protected by sched_domains_mutex:
				 */
				10	static cpumask_var_t sched_domains_tmpmask;
				11	static cpumask_var_t sched_domains_tmpmask2;
				12
				13	#ifdef CONFIG_SCHED_DEBUG
				14
				15	static int __init sched_debug_setup(char *str)
				16	{
				17	sched_debug_enabled = true;
				18
				19	return 0;
				20	}
				21	early_param("sched_debug", sched_debug_setup);
				22
				23	static inline bool sched_debug(void)
				24	{
				25	return sched_debug_enabled;
				26	}
				27
				28	static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				29	struct cpumask *groupmask)
				30	{
				31	struct sched_group *group = sd->groups;
				32
				33	cpumask_clear(groupmask);
				34
				35	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
				36
				37	if (!(sd->flags & SD_LOAD_BALANCE)) {
				38	printk("does not load-balance\n");
				39	if (sd->parent)
				40	printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
				41	return -1;
				42	}
				43
				44	printk(KERN_CONT "span=%*pbl level=%s\n",
				45	cpumask_pr_args(sched_domain_span(sd)), sd->name);
				46
				47	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				48	printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
				49	}
				50	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
				51	printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
				52	}
				53
				54	printk(KERN_DEBUG "%*s groups:", level + 1, "");
				55	do {
				56	if (!group) {
				57	printk("\n");
				58	printk(KERN_ERR "ERROR: group is NULL\n");
				59	break;
				60	}
				61
				62	if (!cpumask_weight(sched_group_span(group))) {
				63	printk(KERN_CONT "\n");
				64	printk(KERN_ERR "ERROR: empty group\n");
				65	break;
				66	}
				67
				68	if (!(sd->flags & SD_OVERLAP) &&
				69	cpumask_intersects(groupmask, sched_group_span(group))) {
				70	printk(KERN_CONT "\n");
				71	printk(KERN_ERR "ERROR: repeated CPUs\n");
				72	break;
				73	}
				74
				75	cpumask_or(groupmask, groupmask, sched_group_span(group));
				76
				77	printk(KERN_CONT " %d:{ span=%*pbl",
				78	group->sgc->id,
				79	cpumask_pr_args(sched_group_span(group)));
				80
				81	if ((sd->flags & SD_OVERLAP) &&
				82	!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
				83	printk(KERN_CONT " mask=%*pbl",
				84	cpumask_pr_args(group_balance_mask(group)));
				85	}
				86
				87	if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
				88	printk(KERN_CONT " cap=%lu", group->sgc->capacity);
				89
				90	if (group == sd->groups && sd->child &&
				91	!cpumask_equal(sched_domain_span(sd->child),
				92	sched_group_span(group))) {
				93	printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
				94	}
				95
				96	printk(KERN_CONT " }");
				97
				98	group = group->next;
				99
				100	if (group != sd->groups)
				101	printk(KERN_CONT ",");
				102
				103	} while (group != sd->groups);
				104	printk(KERN_CONT "\n");
				105
				106	if (!cpumask_equal(sched_domain_span(sd), groupmask))
				107	printk(KERN_ERR "ERROR: groups don't span domain->span\n");
				108
				109	if (sd->parent &&
				110	!cpumask_subset(groupmask, sched_domain_span(sd->parent)))
				111	printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
				112	return 0;
				113	}
				114
				115	static void sched_domain_debug(struct sched_domain *sd, int cpu)
				116	{
				117	int level = 0;
				118
				119	if (!sched_debug_enabled)
				120	return;
				121
				122	if (!sd) {
				123	printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
				124	return;
				125	}
				126
				127	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
				128
				129	for (;;) {
				130	if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
				131	break;
				132	level++;
				133	sd = sd->parent;
				134	if (!sd)
				135	break;
				136	}
				137	}
				138	#else /* !CONFIG_SCHED_DEBUG */
				139
				140	# define sched_debug_enabled 0
				141	# define sched_domain_debug(sd, cpu) do { } while (0)
				142	static inline bool sched_debug(void)
				143	{
				144	return false;
				145	}
				146	#endif /* CONFIG_SCHED_DEBUG */
				147
				148	static int sd_degenerate(struct sched_domain *sd)
				149	{
				150	if (cpumask_weight(sched_domain_span(sd)) == 1)
				151	return 1;
				152
				153	/* Following flags need at least 2 groups */
				154	if (sd->flags & (SD_LOAD_BALANCE \|
				155	SD_BALANCE_NEWIDLE \|
				156	SD_BALANCE_FORK \|
				157	SD_BALANCE_EXEC \|
				158	SD_SHARE_CPUCAPACITY \|
				159	SD_ASYM_CPUCAPACITY \|
				160	SD_SHARE_PKG_RESOURCES \|
				161	SD_SHARE_POWERDOMAIN)) {
				162	if (sd->groups != sd->groups->next)
				163	return 0;
				164	}
				165
				166	/* Following flags don't use groups */
				167	if (sd->flags & (SD_WAKE_AFFINE))
				168	return 0;
				169
				170	return 1;
				171	}
				172
				173	static int
				174	sd_parent_degenerate(struct sched_domain sd, struct sched_domain parent)
				175	{
				176	unsigned long cflags = sd->flags, pflags = parent->flags;
				177
				178	if (sd_degenerate(parent))
				179	return 1;
				180
				181	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
				182	return 0;
				183
				184	/* Flags needing groups don't count if only 1 group in parent */
				185	if (parent->groups == parent->groups->next) {
				186	pflags &= ~(SD_LOAD_BALANCE \|
				187	SD_BALANCE_NEWIDLE \|
				188	SD_BALANCE_FORK \|
				189	SD_BALANCE_EXEC \|
				190	SD_ASYM_CPUCAPACITY \|
				191	SD_SHARE_CPUCAPACITY \|
				192	SD_SHARE_PKG_RESOURCES \|
				193	SD_PREFER_SIBLING \|
				194	SD_SHARE_POWERDOMAIN);
				195	if (nr_node_ids == 1)
				196	pflags &= ~SD_SERIALIZE;
				197	}
				198	if (~cflags & pflags)
				199	return 0;
				200
				201	return 1;
				202	}
				203
				204	#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
				/* Static key toggled by sched_energy_set() when EAS starts or stops. */
				205	DEFINE_STATIC_KEY_FALSE(sched_energy_present);
				/* Sysctl knob; build_perf_domains() bails out when it is zero. */
				206	unsigned int sysctl_sched_energy_aware = 1;
				/* Held by the sysctl handler around the domain rebuild it triggers. */
				207	DEFINE_MUTEX(sched_energy_mutex);
				/* Set by the sysctl handler for the duration of that rebuild. */
				208	bool sched_energy_update;
				209
				210	#ifdef CONFIG_PROC_SYSCTL
				211	int sched_energy_aware_handler(struct ctl_table *table, int write,
				212	void __user buffer, size_t lenp, loff_t *ppos)
				213	{
				214	int ret, state;
				215
				216	if (write && !capable(CAP_SYS_ADMIN))
				217	return -EPERM;
				218
				219	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
				220	if (!ret && write) {
				221	state = static_branch_unlikely(&sched_energy_present);
				222	if (state != sysctl_sched_energy_aware) {
				223	mutex_lock(&sched_energy_mutex);
				224	sched_energy_update = 1;
				225	rebuild_sched_domains();
				226	sched_energy_update = 0;
				227	mutex_unlock(&sched_energy_mutex);
				228	}
				229	}
				230
				231	return ret;
				232	}
				233	#endif
				234
				235	static void free_pd(struct perf_domain *pd)
				236	{
				237	struct perf_domain *tmp;
				238
				239	while (pd) {
				240	tmp = pd->next;
				241	kfree(pd);
				242	pd = tmp;
				243	}
				244	}
				245
				246	static struct perf_domain find_pd(struct perf_domain pd, int cpu)
				247	{
				248	while (pd) {
				249	if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
				250	return pd;
				251	pd = pd->next;
				252	}
				253
				254	return NULL;
				255	}
				256
				257	static struct perf_domain *pd_init(int cpu)
				258	{
				259	struct em_perf_domain *obj = em_cpu_get(cpu);
				260	struct perf_domain *pd;
				261
				262	if (!obj) {
				263	if (sched_debug())
				264	pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
				265	return NULL;
				266	}
				267
				268	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
				269	if (!pd)
				270	return NULL;
				271	pd->em_pd = obj;
				272
				273	return pd;
				274	}
				275
				276	static void perf_domain_debug(const struct cpumask *cpu_map,
				277	struct perf_domain *pd)
				278	{
				279	if (!sched_debug() \|\| !pd)
				280	return;
				281
				282	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
				283
				284	while (pd) {
				285	printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
				286	cpumask_first(perf_domain_span(pd)),
				287	cpumask_pr_args(perf_domain_span(pd)),
				288	em_pd_nr_cap_states(pd->em_pd));
				289	pd = pd->next;
				290	}
				291
				292	printk(KERN_CONT "\n");
				293	}
				294
				295	static void destroy_perf_domain_rcu(struct rcu_head *rp)
				296	{
				297	struct perf_domain *pd;
				298
				299	pd = container_of(rp, struct perf_domain, rcu);
				300	free_pd(pd);
				301	}
				302
				303	static void sched_energy_set(bool has_eas)
				304	{
				305	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
				306	if (sched_debug())
				307	pr_info("%s: stopping EAS\n", __func__);
				308	static_branch_disable_cpuslocked(&sched_energy_present);
				309	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
				310	if (sched_debug())
				311	pr_info("%s: starting EAS\n", __func__);
				312	static_branch_enable_cpuslocked(&sched_energy_present);
				313	}
				314	}
				315
				316	/*
				317	* EAS can be used on a root domain if it meets all the following conditions:
				318	* 1. an Energy Model (EM) is available;
				319	* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
				320	* 3. the EM complexity is low enough to keep scheduling overheads low;
				321	* 4. schedutil is driving the frequency of all CPUs of the rd;
				322	*
				323	* The complexity of the Energy Model is defined as:
				324	*
				325	* C = nr_pd * (nr_cpus + nr_cs)
				326	*
				327	* with parameters defined as:
				328	* - nr_pd: the number of performance domains
				329	* - nr_cpus: the number of CPUs
				330	* - nr_cs: the sum of the number of capacity states of all performance
				331	* domains (for example, on a system with 2 performance domains,
				332	* with 10 capacity states each, nr_cs = 2 * 10 = 20).
				333	*
				334	* It is generally not a good idea to use such a model in the wake-up path on
				335	* very complex platforms because of the associated scheduling overheads. The
				336	* arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
				337	* with per-CPU DVFS and less than 8 capacity states each, for example.
				338	*/
				339	#define EM_MAX_COMPLEXITY 2048
				340
				/* Governor that build_perf_domains() requires on every CPU's policy. */
				341	extern struct cpufreq_governor schedutil_gov;
				342	static bool build_perf_domains(const struct cpumask *cpu_map)
				343	{
				344	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
				345	struct perf_domain pd = NULL, tmp;
				346	int cpu = cpumask_first(cpu_map);
				347	struct root_domain *rd = cpu_rq(cpu)->rd;
				348	struct cpufreq_policy *policy;
				349	struct cpufreq_governor *gov;
				350
				351	if (!sysctl_sched_energy_aware)
				352	goto free;
				353
				354	/* EAS is enabled for asymmetric CPU capacity topologies. */
				355	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
				356	if (sched_debug()) {
				357	pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
				358	cpumask_pr_args(cpu_map));
				359	}
				360	goto free;
				361	}
				362
				363	for_each_cpu(i, cpu_map) {
				364	/* Skip already covered CPUs. */
				365	if (find_pd(pd, i))
				366	continue;
				367
				368	/* Do not attempt EAS if schedutil is not being used. */
				369	policy = cpufreq_cpu_get(i);
				370	if (!policy)
				371	goto free;
				372	gov = policy->governor;
				373	cpufreq_cpu_put(policy);
				374	if (gov != &schedutil_gov) {
				375	if (rd->pd)
				376	pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
				377	cpumask_pr_args(cpu_map));
				378	goto free;
				379	}
				380
				381	/* Create the new pd and add it to the local list. */
				382	tmp = pd_init(i);
				383	if (!tmp)
				384	goto free;
				385	tmp->next = pd;
				386	pd = tmp;
				387
				388	/*
				389	* Count performance domains and capacity states for the
				390	* complexity check.
				391	*/
				392	nr_pd++;
				393	nr_cs += em_pd_nr_cap_states(pd->em_pd);
				394	}
				395
				396	/* Bail out if the Energy Model complexity is too high. */
				397	if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
				398	WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
				399	cpumask_pr_args(cpu_map));
				400	goto free;
				401	}
				402
				403	perf_domain_debug(cpu_map, pd);
				404
				405	/* Attach the new list of performance domains to the root domain. */
				406	tmp = rd->pd;
				407	rcu_assign_pointer(rd->pd, pd);
				408	if (tmp)
				409	call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
				410
				411	return !!pd;
				412
				413	free:
				414	free_pd(pd);
				415	tmp = rd->pd;
				416	rcu_assign_pointer(rd->pd, NULL);
				417	if (tmp)
				418	call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
				419
				420	return false;
				421	}
				422	#else
				/* Stub for builds without the energy-aware code above: nothing to free. */
				static void free_pd(struct perf_domain *pd)
				{
				}
				424	#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
				425
				426	static void free_rootdomain(struct rcu_head *rcu)
				427	{
				428	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
				429
				430	cpupri_cleanup(&rd->cpupri);
				431	cpudl_cleanup(&rd->cpudl);
				432	free_cpumask_var(rd->dlo_mask);
				433	free_cpumask_var(rd->rto_mask);
				434	free_cpumask_var(rd->online);
				435	free_cpumask_var(rd->span);
				436	free_pd(rd->pd);
				437	kfree(rd);
				438	}
				439
				440	void rq_attach_root(struct rq rq, struct root_domain rd)
				441	{
				442	struct root_domain *old_rd = NULL;
				443	unsigned long flags;
				444
				445	raw_spin_lock_irqsave(&rq->lock, flags);
				446
				447	if (rq->rd) {
				448	old_rd = rq->rd;
				449
				450	if (cpumask_test_cpu(rq->cpu, old_rd->online))
				451	set_rq_offline(rq);
				452
				453	cpumask_clear_cpu(rq->cpu, old_rd->span);
				454
				455	/*
				456	* If we dont want to free the old_rd yet then
				457	* set old_rd to NULL to skip the freeing later
				458	* in this function:
				459	*/
				460	if (!atomic_dec_and_test(&old_rd->refcount))
				461	old_rd = NULL;
				462	}
				463
				464	atomic_inc(&rd->refcount);
				465	rq->rd = rd;
				466
				467	cpumask_set_cpu(rq->cpu, rd->span);
				468	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
				469	set_rq_online(rq);
				470
				471	raw_spin_unlock_irqrestore(&rq->lock, flags);
				472
				473	if (old_rd)
				474	call_rcu(&old_rd->rcu, free_rootdomain);
				475	}
				476
				477	void sched_get_rd(struct root_domain *rd)
				478	{
				479	atomic_inc(&rd->refcount);
				480	}
				481
				482	void sched_put_rd(struct root_domain *rd)
				483	{
				484	if (!atomic_dec_and_test(&rd->refcount))
				485	return;
				486
				487	call_rcu(&rd->rcu, free_rootdomain);
				488	}
				489
				490	static int init_rootdomain(struct root_domain *rd)
				491	{
				492	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
				493	goto out;
				494	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
				495	goto free_span;
				496	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
				497	goto free_online;
				498	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
				499	goto free_dlo_mask;
				500
				501	#ifdef HAVE_RT_PUSH_IPI
				502	rd->rto_cpu = -1;
				503	raw_spin_lock_init(&rd->rto_lock);
				504	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
				505	#endif
				506
				507	init_dl_bw(&rd->dl_bw);
				508	if (cpudl_init(&rd->cpudl) != 0)
				509	goto free_rto_mask;
				510
				511	if (cpupri_init(&rd->cpupri) != 0)
				512	goto free_cpudl;
				513
				514	init_max_cpu_capacity(&rd->max_cpu_capacity);
				515
				516	return 0;
				517
				518	free_cpudl:
				519	cpudl_cleanup(&rd->cpudl);
				520	free_rto_mask:
				521	free_cpumask_var(rd->rto_mask);
				522	free_dlo_mask:
				523	free_cpumask_var(rd->dlo_mask);
				524	free_online:
				525	free_cpumask_var(rd->online);
				526	free_span:
				527	free_cpumask_var(rd->span);
				528	out:
				529	return -ENOMEM;
				530	}
				531
				532	/*
				533	* By default the system creates a single root-domain with all CPUs as
				534	* members (mimicking the global state we have today).
				535	*/
				/* Set up by init_defrootdomain(), which gives it an initial refcount of 1. */
				536	struct root_domain def_root_domain;
				537
				538	void init_defrootdomain(void)
				539	{
				540	init_rootdomain(&def_root_domain);
				541
				542	atomic_set(&def_root_domain.refcount, 1);
				543	}
				544
				545	static struct root_domain *alloc_rootdomain(void)
				546	{
				547	struct root_domain *rd;
				548
				549	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
				550	if (!rd)
				551	return NULL;
				552
				553	if (init_rootdomain(rd) != 0) {
				554	kfree(rd);
				555	return NULL;
				556	}
				557
				558	return rd;
				559	}
				560
				561	static void free_sched_groups(struct sched_group *sg, int free_sgc)
				562	{
				563	struct sched_group tmp, first;
				564
				565	if (!sg)
				566	return;
				567
				568	first = sg;
				569	do {
				570	tmp = sg->next;
				571
				572	if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
				573	kfree(sg->sgc);
				574
				575	if (atomic_dec_and_test(&sg->ref))
				576	kfree(sg);
				577	sg = tmp;
				578	} while (sg != first);
				579	}
				580
				581	static void destroy_sched_domain(struct sched_domain *sd)
				582	{
				583	/*
				584	* A normal sched domain may have multiple group references, an
				585	* overlapping domain, having private groups, only one. Iterate,
				586	* dropping group/capacity references, freeing where none remain.
				587	*/
				588	free_sched_groups(sd->groups, 1);
				589
				590	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
				591	kfree(sd->shared);
				592	kfree(sd);
				593	}
				594
				595	static void destroy_sched_domains_rcu(struct rcu_head *rcu)
				596	{
				597	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
				598
				599	while (sd) {
				600	struct sched_domain *parent = sd->parent;
				601	destroy_sched_domain(sd);
				602	sd = parent;
				603	}
				604	}
				605
				606	static void destroy_sched_domains(struct sched_domain *sd)
				607	{
				608	if (sd)
				609	call_rcu(&sd->rcu, destroy_sched_domains_rcu);
				610	}
				611
				612	/*
				613	* Keep a special pointer to the highest sched_domain that has
				614	* SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
				615	* allows us to avoid some pointer chasing in select_idle_sibling().
				616	*
				617	* Also keep a unique ID per domain (we use the first CPU number in
				618	* the cpumask of the domain), this allows us to quickly tell if
				619	* two CPUs are in the same cache domain, see cpus_share_cache().
				620	*/
				/* The per-CPU variables below are all (re)written by update_top_cache_domain(). */
				621	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
				622	DEFINE_PER_CPU(int, sd_llc_size);
				623	DEFINE_PER_CPU(int, sd_llc_id);
				624	DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
				625	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
				626	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
				627	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
				/* NOTE(review): this key is not toggled in this part of the file; confirm where it is enabled. */
				628	DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
				629
				630	static void update_top_cache_domain(int cpu)
				631	{
				632	struct sched_domain_shared *sds = NULL;
				633	struct sched_domain *sd;
				634	int id = cpu;
				635	int size = 1;
				636
				637	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
				638	if (sd) {
				639	id = cpumask_first(sched_domain_span(sd));
				640	size = cpumask_weight(sched_domain_span(sd));
				641	sds = sd->shared;
				642	}
				643
				644	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
				645	per_cpu(sd_llc_size, cpu) = size;
				646	per_cpu(sd_llc_id, cpu) = id;
				647	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
				648
				649	sd = lowest_flag_domain(cpu, SD_NUMA);
				650	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
				651
				652	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
				653	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
				654
				655	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
				656	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
				657	}
				658
				659	/*
				660	* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
				661	* hold the hotplug lock.
				662	*/
				663	static void
				664	cpu_attach_domain(struct sched_domain sd, struct root_domain rd, int cpu)
				665	{
				666	struct rq *rq = cpu_rq(cpu);
				667	struct sched_domain *tmp;
				668
				669	/* Remove the sched domains which do not contribute to scheduling. */
				670	for (tmp = sd; tmp; ) {
				671	struct sched_domain *parent = tmp->parent;
				672	if (!parent)
				673	break;
				674
				675	if (sd_parent_degenerate(tmp, parent)) {
				676	tmp->parent = parent->parent;
				677	if (parent->parent)
				678	parent->parent->child = tmp;
				679	/*
				680	* Transfer SD_PREFER_SIBLING down in case of a
				681	* degenerate parent; the spans match for this
				682	* so the property transfers.
				683	*/
				684	if (parent->flags & SD_PREFER_SIBLING)
				685	tmp->flags \|= SD_PREFER_SIBLING;
				686	destroy_sched_domain(parent);
				687	} else
				688	tmp = tmp->parent;
				689	}
				690
				691	if (sd && sd_degenerate(sd)) {
				692	tmp = sd;
				693	sd = sd->parent;
				694	destroy_sched_domain(tmp);
				695	if (sd)
				696	sd->child = NULL;
				697	}
				698
				699	sched_domain_debug(sd, cpu);
				700
				701	rq_attach_root(rq, rd);
				702	tmp = rq->sd;
				703	rcu_assign_pointer(rq->sd, sd);
				704	dirty_sched_domain_sysctl(cpu);
				705	destroy_sched_domains(tmp);
				706
				707	update_top_cache_domain(cpu);
				708	}
				709
				/*
				 * Pairs a per-CPU array of sched_domain pointers with a root_domain
				 * pointer. NOTE(review): the users of this structure are outside this
				 * excerpt.
				 */
				710	struct s_data {
				711	struct sched_domain * __percpu *sd;
				712	struct root_domain *rd;
				713	};
				714
				/*
				 * NOTE(review): presumably names how far an allocation sequence got,
				 * for unwinding; the users are outside this excerpt, so confirm.
				 */
				715	enum s_alloc {
				716	sa_rootdomain,
				717	sa_sd,
				718	sa_sd_storage,
				719	sa_none,
				720	};
				721
				722	/*
				723	* Return the canonical balance CPU for this group, this is the first CPU
				724	* of this group that's also in the balance mask.
				725	*
				726	* The balance mask are all those CPUs that could actually end up at this
				727	* group. See build_balance_mask().
				728	*
				729	* Also see should_we_balance().
				730	*/
				int group_balance_cpu(struct sched_group *sg)
				{
					/* The lowest-numbered CPU of the balance mask is the canonical one. */
					const struct cpumask *balance_mask = group_balance_mask(sg);

					return cpumask_first(balance_mask);
				}
				735
				736
				737	/*
				738	* NUMA topology (first read the regular topology blurb below)
				739	*
				740	* Given a node-distance table, for example:
				741	*
				742	* node 0 1 2 3
				743	* 0: 10 20 30 20
				744	* 1: 20 10 20 30
				745	* 2: 30 20 10 20
				746	* 3: 20 30 20 10
				747	*
				748	* which represents a 4 node ring topology like:
				749	*
				750	* 0 ----- 1
				751	* |       |
				752	* |       |
				753	* |       |
				754	* 3 ----- 2
				755	*
				756	* We want to construct domains and groups to represent this. The way we go
				757	* about doing this is to build the domains on 'hops'. For each NUMA level we
				758	* construct the mask of all nodes reachable in @level hops.
				759	*
				760	* For the above NUMA topology that gives 3 levels:
				761	*
				762	* NUMA-2 0-3 0-3 0-3 0-3
				763	* groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
				764	*
				765	* NUMA-1 0-1,3 0-2 1-3 0,2-3
				766	* groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
				767	*
				768	* NUMA-0 0 1 2 3
				769	*
				770	*
				771	* As can be seen; things don't nicely line up as with the regular topology.
				772	* When we iterate a domain in child domain chunks some nodes can be
				773	* represented multiple times -- hence the "overlap" naming for this part of
				774	* the topology.
				775	*
				776	* In order to minimize this overlap, we only build enough groups to cover the
				777	* domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
				778	*
				779	* Because:
				780	*
				781	* - the first group of each domain is its child domain; this
				782	* gets us the first 0-1,3
				783	* - the only uncovered node is 2, whose child domain is 1-3.
				784	*
				785	* However, because of the overlap, computing a unique CPU for each group is
				786	* more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
				787	* groups include the CPUs of Node-0, while those CPUs would not in fact ever
				788	* end up at those groups (they would end up in group: 0-1,3).
				789	*
				790	* To correct this we have to introduce the group balance mask. This mask
				791	* will contain those CPUs in the group that can reach this group given the
				792	* (child) domain tree.
				793	*
				794	* With this we can once again compute balance_cpu and sched_group_capacity
				795	* relations.
				796	*
				797	* XXX include words on how balance_cpu is unique and therefore can be
				798	* used for sched_group_capacity links.
				799	*
				800	*
				801	* Another 'interesting' topology is:
				802	*
				803	* node 0 1 2 3
				804	* 0: 10 20 20 30
				805	* 1: 20 10 20 20
				806	* 2: 20 20 10 20
				807	* 3: 30 20 20 10
				808	*
				809	* Which looks a little like:
				810	*
				811	* 0 ----- 1
				812	* |     / |
				813	* |   /   |
				814	* | /     |
				815	* 2 ----- 3
				816	*
				817	* This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
				818	* are not.
				819	*
				820	* This leads to a few particularly weird cases where the sched_domain's are
				821	* not of the same number for each CPU. Consider:
				822	*
				823	* NUMA-2 0-3 0-3
				824	* groups: {0-2},{1-3} {1-3},{0-2}
				825	*
				826	* NUMA-1 0-2 0-3 0-3 1-3
				827	*
				828	* NUMA-0 0 1 2 3
				829	*
				830	*/
				831
				832
				833	/*
				834	* Build the balance mask; it contains only those CPUs that can arrive at this
				835	* group and should be considered to continue balancing.
				836	*
				837	* We do this during the group creation pass, therefore the group information
				838	* isn't complete yet, however since each group represents a (child) domain we
				839	* can fully construct this using the sched_domain bits (which are already
				840	* complete).
				841	*/
				842	static void
				843	build_balance_mask(struct sched_domain sd, struct sched_group sg, struct cpumask *mask)
				844	{
				845	const struct cpumask *sg_span = sched_group_span(sg);
				846	struct sd_data *sdd = sd->private;
				847	struct sched_domain *sibling;
				848	int i;
				849
				850	cpumask_clear(mask);
				851
				852	for_each_cpu(i, sg_span) {
				853	sibling = *per_cpu_ptr(sdd->sd, i);
				854
				855	/*
				856	* Can happen in the asymmetric case, where these siblings are
				857	* unused. The mask will not be empty because those CPUs that
				858	* do have the top domain _should_ span the domain.
				859	*/
				860	if (!sibling->child)
				861	continue;
				862
				863	/* If we would not end up here, we can't continue from here */
				864	if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
				865	continue;
				866
				867	cpumask_set_cpu(i, mask);
				868	}
				869
				870	/* We must not have empty masks here */
				871	WARN_ON_ONCE(cpumask_empty(mask));
				872	}
				873
				874	/*
				875	* XXX: This creates per-node group entries; since the load-balancer will
				876	* immediately access remote memory to construct this group's load-balance
				877	* statistics having the groups node local is of dubious benefit.
				878	*/
				879	static struct sched_group *
				880	build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
				881	{
				882	struct sched_group *sg;
				883	struct cpumask *sg_span;
				884
				885	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				886	GFP_KERNEL, cpu_to_node(cpu));
				887
				888	if (!sg)
				889	return NULL;
				890
				891	sg_span = sched_group_span(sg);
				892	if (sd->child)
				893	cpumask_copy(sg_span, sched_domain_span(sd->child));
				894	else
				895	cpumask_copy(sg_span, sched_domain_span(sd));
				896
				897	atomic_inc(&sg->ref);
				898	return sg;
				899	}
				900
				901	static void init_overlap_sched_group(struct sched_domain *sd,
				902	struct sched_group *sg)
				903	{
				904	struct cpumask *mask = sched_domains_tmpmask2;
				905	struct sd_data *sdd = sd->private;
				906	struct cpumask *sg_span;
				907	int cpu;
				908
				909	build_balance_mask(sd, sg, mask);
				910	cpu = cpumask_first_and(sched_group_span(sg), mask);
				911
				912	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
				913	if (atomic_inc_return(&sg->sgc->ref) == 1)
				914	cpumask_copy(group_balance_mask(sg), mask);
				915	else
				916	WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
				917
				918	/*
				919	* Initialize sgc->capacity such that even if we mess up the
				920	* domains and no possible iteration will get us here, we won't
				921	* die on a /0 trap.
				922	*/
				923	sg_span = sched_group_span(sg);
				924	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
				925	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
				926	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
				927	}
				928
				929	static int
				930	build_overlap_sched_groups(struct sched_domain *sd, int cpu)
				931	{
				932	struct sched_group first = NULL, last = NULL, *sg;
				933	const struct cpumask *span = sched_domain_span(sd);
				934	struct cpumask *covered = sched_domains_tmpmask;
				935	struct sd_data *sdd = sd->private;
				936	struct sched_domain *sibling;
				937	int i;
				938
				939	cpumask_clear(covered);
				940
				941	for_each_cpu_wrap(i, span, cpu) {
				942	struct cpumask *sg_span;
				943
				944	if (cpumask_test_cpu(i, covered))
				945	continue;
				946
				947	sibling = *per_cpu_ptr(sdd->sd, i);
				948
				949	/*
				950	* Asymmetric node setups can result in situations where the
				951	* domain tree is of unequal depth, make sure to skip domains
				952	* that already cover the entire range.
				953	*
				954	* In that case build_sched_domains() will have terminated the
				955	* iteration early and our sibling sd spans will be empty.
				956	* Domains should always include the CPU they're built on, so
				957	* check that.
				958	*/
				959	if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
				960	continue;
				961
				962	sg = build_group_from_child_sched_domain(sibling, cpu);
				963	if (!sg)
				964	goto fail;
				965
				966	sg_span = sched_group_span(sg);
				967	cpumask_or(covered, covered, sg_span);
				968
				969	init_overlap_sched_group(sd, sg);
				970
				971	if (!first)
				972	first = sg;
				973	if (last)
				974	last->next = sg;
				975	last = sg;
				976	last->next = first;
				977	}
				978	sd->groups = first;
				979
				980	return 0;
				981
				982	fail:
				983	free_sched_groups(first, 0);
				984
				985	return -ENOMEM;
				986	}
				987
				988
				989	/*
				990	* Package topology (also see the load-balance blurb in fair.c)
				991	*
				992	* The scheduler builds a tree structure to represent a number of important
				993	* topology features. By default (default_topology[]) these include:
				994	*
				995	* - Simultaneous multithreading (SMT)
				996	* - Multi-Core Cache (MC)
				997	* - Package (DIE)
				998	*
				999	* Where the last one more or less denotes everything up to a NUMA node.
				1000	*
				1001	* The tree consists of 3 primary data structures:
				1002	*
				1003	* sched_domain -> sched_group -> sched_group_capacity
				1004	* ^ ^ ^ ^
				1005	* `-' `-'
				1006	*
				1007	* The sched_domains are per-CPU and have a two way link (parent & child) and
				1008	* denote the ever growing mask of CPUs belonging to that level of topology.
				1009	*
				1010	* Each sched_domain has a circular (double) linked list of sched_group's, each
				1011	* denoting the domains of the level below (or individual CPUs in case of the
				1012	* first domain level). The sched_group linked by a sched_domain includes the
				1013	* CPU of that sched_domain [*].
				1014	*
				1015	* Take for instance a 2 threaded, 2 core, 2 cache cluster part:
				1016	*
				1017	* CPU   0   1   2   3   4   5   6   7
				1018	*
				1019	* DIE  [                             ]
				1020	* MC   [             ] [             ]
				1021	* SMT  [     ] [     ] [     ] [     ]
				1022	*
				1023	* - or -
				1024	*
				1025	* DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
				1026	* MC   0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
				1027	* SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
				1028	*
				1029	* CPU   0   1   2   3   4   5   6   7
				1030	*
				1031	* One way to think about it is: sched_domain moves you up and down among these
				1032	* topology levels, while sched_group moves you sideways through it, at child
				1033	* domain granularity.
				1034	*
				1035	* sched_group_capacity ensures each unique sched_group has shared storage.
				1036	*
				1037	* There are two related construction problems, both require a CPU that
				1038	* uniquely identifies each group (for a given domain):
				1039	*
				1040	* - The first is the balance_cpu (see should_we_balance() and the
				1041	* load-balance blurb in fair.c); for each group we only want 1 CPU to
				1042	* continue balancing at a higher domain.
				1043	*
				1044	* - The second is the sched_group_capacity; we want all identical groups
				1045	* to share a single sched_group_capacity.
				1046	*
				1047	* These topologies are exclusive by construction. That is, it's
				1048	* impossible for an SMT thread to belong to multiple cores, and cores to
				1049	* be part of multiple caches. There is a very clear and unique location
				1050	* for each CPU in the hierarchy.
				1051	*
				1052	* Therefore computing a unique CPU for each group is trivial (the iteration
				1053	* mask is redundant and set all 1s; all CPUs in a group will end up at _that_
				1054	* group), we can simply pick the first CPU in each group.
				1055	*
				1056	*
				1057	* [*] in other words, the first group of each domain is its child domain.
				1058	*/
				1059
				1060	static struct sched_group get_group(int cpu, struct sd_data sdd)
				1061	{
				1062	struct sched_domain sd = per_cpu_ptr(sdd->sd, cpu);
				1063	struct sched_domain *child = sd->child;
				1064	struct sched_group *sg;
				1065	bool already_visited;
				1066
				1067	if (child)
				1068	cpu = cpumask_first(sched_domain_span(child));
				1069
				1070	sg = *per_cpu_ptr(sdd->sg, cpu);
				1071	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
				1072
				1073	/* Increase refcounts for claim_allocations: */
				1074	already_visited = atomic_inc_return(&sg->ref) > 1;
				1075	/* sgc visits should follow a similar trend as sg */
				1076	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
				1077
				1078	/* If we have already visited that group, it's already initialized. */
				1079	if (already_visited)
				1080	return sg;
				1081
				1082	if (child) {
				1083	cpumask_copy(sched_group_span(sg), sched_domain_span(child));
				1084	cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
				1085	} else {
				1086	cpumask_set_cpu(cpu, sched_group_span(sg));
				1087	cpumask_set_cpu(cpu, group_balance_mask(sg));
				1088	}
				1089
				1090	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
				1091	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
				1092	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
				1093
				1094	return sg;
				1095	}
				1096
				1097	/*
				1098	* build_sched_groups will build a circular linked list of the groups
				1099	* covered by the given span, will set each group's ->cpumask correctly,
				1100	* and will initialize their ->sgc.
				1101	*
				1102	* Assumes the sched_domain tree is fully constructed
				1103	*/
				1104	static int
				1105	build_sched_groups(struct sched_domain *sd, int cpu)
				1106	{
				1107	struct sched_group first = NULL, last = NULL;
				1108	struct sd_data *sdd = sd->private;
				1109	const struct cpumask *span = sched_domain_span(sd);
				1110	struct cpumask *covered;
				1111	int i;
				1112
				1113	lockdep_assert_held(&sched_domains_mutex);
				1114	covered = sched_domains_tmpmask;
				1115
				1116	cpumask_clear(covered);
				1117
				1118	for_each_cpu_wrap(i, span, cpu) {
				1119	struct sched_group *sg;
				1120
				1121	if (cpumask_test_cpu(i, covered))
				1122	continue;
				1123
				1124	sg = get_group(i, sdd);
				1125
				1126	cpumask_or(covered, covered, sched_group_span(sg));
				1127
				1128	if (!first)
				1129	first = sg;
				1130	if (last)
				1131	last->next = sg;
				1132	last = sg;
				1133	}
				1134	last->next = first;
				1135	sd->groups = first;
				1136
				1137	return 0;
				1138	}
				1139
				1140	/*
				1141	* Initialize sched groups cpu_capacity.
				1142	*
				1143	* cpu_capacity indicates the capacity of sched group, which is used while
				1144	* distributing the load between different sched groups in a sched domain.
				1145	* Typically cpu_capacity for all the groups in a sched domain will be same
				1146	* unless there are asymmetries in the topology. If there are asymmetries,
				1147	* group having more cpu_capacity will pickup more load compared to the
				1148	* group having less cpu_capacity.
				1149	*/
				1150	static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
				1151	{
				1152	struct sched_group *sg = sd->groups;
				1153
				1154	WARN_ON(!sg);
				1155
				1156	do {
				1157	int cpu, max_cpu = -1;
				1158
				1159	sg->group_weight = cpumask_weight(sched_group_span(sg));
				1160
				1161	if (!(sd->flags & SD_ASYM_PACKING))
				1162	goto next;
				1163
				1164	for_each_cpu(cpu, sched_group_span(sg)) {
				1165	if (max_cpu < 0)
				1166	max_cpu = cpu;
				1167	else if (sched_asym_prefer(cpu, max_cpu))
				1168	max_cpu = cpu;
				1169	}
				1170	sg->asym_prefer_cpu = max_cpu;
				1171
				1172	next:
				1173	sg = sg->next;
				1174	} while (sg != sd->groups);
				1175
				1176	if (cpu != group_balance_cpu(sg))
				1177	return;
				1178
				1179	update_group_capacity(sd, cpu);
				1180	}
				1181
				1182	/*
				1183	* Initializers for schedule domains
				1184	* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
				1185	*/
				1186
				/*
				 * Fallback relax level used by set_domain_attribute() when the caller
				 * supplies no attribute or a negative level. A negative value here means
				 * "no default". Set from the "relax_domain_level=" boot parameter.
				 */
				1187	static int default_relax_domain_level = -1;
				/* Highest sd->level assigned so far; raised in build_sched_domain(). */
				1188	int sched_domain_level_max;
				1189
				1190	static int __init setup_relax_domain_level(char *str)
				1191	{
				1192	if (kstrtoint(str, 0, &default_relax_domain_level))
				1193	pr_warn("Unable to set relax_domain_level\n");
				1194
				1195	return 1;
				1196	}
				1197	__setup("relax_domain_level=", setup_relax_domain_level);
				1198
				1199	static void set_domain_attribute(struct sched_domain *sd,
				1200	struct sched_domain_attr *attr)
				1201	{
				1202	int request;
				1203
				1204	if (!attr \|\| attr->relax_domain_level < 0) {
				1205	if (default_relax_domain_level < 0)
				1206	return;
				1207	request = default_relax_domain_level;
				1208	} else
				1209	request = attr->relax_domain_level;
				1210
				1211	if (sd->level >= request) {
				1212	/* Turn off idle balance on this domain: */
				1213	sd->flags &= ~(SD_BALANCE_WAKE\|SD_BALANCE_NEWIDLE);
				1214	}
				1215	}
				1216
				1217	static void __sdt_free(const struct cpumask *cpu_map);
				1218	static int __sdt_alloc(const struct cpumask *cpu_map);
				1219
				1220	static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				1221	const struct cpumask *cpu_map)
				1222	{
				1223	switch (what) {
				1224	case sa_rootdomain:
				1225	if (!atomic_read(&d->rd->refcount))
				1226	free_rootdomain(&d->rd->rcu);
				1227	/* Fall through */
				1228	case sa_sd:
				1229	free_percpu(d->sd);
				1230	/* Fall through */
				1231	case sa_sd_storage:
				1232	__sdt_free(cpu_map);
				1233	/* Fall through */
				1234	case sa_none:
				1235	break;
				1236	}
				1237	}
				1238
				1239	static enum s_alloc
				1240	__visit_domain_allocation_hell(struct s_data d, const struct cpumask cpu_map)
				1241	{
				1242	memset(d, 0, sizeof(*d));
				1243
				1244	if (__sdt_alloc(cpu_map))
				1245	return sa_sd_storage;
				1246	d->sd = alloc_percpu(struct sched_domain *);
				1247	if (!d->sd)
				1248	return sa_sd_storage;
				1249	d->rd = alloc_rootdomain();
				1250	if (!d->rd)
				1251	return sa_sd;
				1252
				1253	return sa_rootdomain;
				1254	}
				1255
				1256	/*
				1257	* NULL the sd_data elements we've used to build the sched_domain and
				1258	* sched_group structure so that the subsequent __free_domain_allocs()
				1259	* will not free the data we're using.
				1260	*/
				1261	static void claim_allocations(int cpu, struct sched_domain *sd)
				1262	{
				1263	struct sd_data *sdd = sd->private;
				1264
				1265	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
				1266	*per_cpu_ptr(sdd->sd, cpu) = NULL;
				1267
				1268	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
				1269	*per_cpu_ptr(sdd->sds, cpu) = NULL;
				1270
				1271	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
				1272	*per_cpu_ptr(sdd->sg, cpu) = NULL;
				1273
				1274	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
				1275	*per_cpu_ptr(sdd->sgc, cpu) = NULL;
				1276	}
				1277
				1278	#ifdef CONFIG_NUMA
				/* NUMA topology classification; assigned by init_numa_topology_type(). */
				1279	enum numa_topology_type sched_numa_topology_type;
				1280
				/* Number of unique node distances; published at the end of sched_init_numa(). */
				1281	static int sched_domains_numa_levels;
				/* NUMA level handed from sd_init() to sd_numa_mask(). */
				1282	static int sched_domains_curr_level;
				1283
				/* Largest entry of sched_domains_numa_distance[]. */
				1284	int sched_max_numa_distance;
				/* Unique node distances, one entry per NUMA level. */
				1285	static int *sched_domains_numa_distance;
				/* CPU masks indexed as [NUMA level][node]. */
				1286	static struct cpumask ***sched_domains_numa_masks;
				/* Distance threshold compared against in sd_init() for NUMA domains. */
				1287	int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
				1288	#endif
				1289
				/*
				 * SD_flags allowed in topology descriptions.
				 *
				 * These flags are purely descriptive of the topology and do not prescribe
				 * behaviour. Behaviour is artificial and mapped in the below sd_init()
				 * function:
				 *
				 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
				 *   SD_SHARE_PKG_RESOURCES - describes shared caches
				 *   SD_NUMA                - describes NUMA topologies
				 *   SD_SHARE_POWERDOMAIN   - describes shared power domain
				 *
				 * Odd one out, which besides describing the topology also prescribes the
				 * desired behaviour that goes along with it:
				 *
				 *   SD_ASYM_PACKING        - describes SMT quirks
				 */
				#define TOPOLOGY_SD_FLAGS		\
					(SD_SHARE_CPUCAPACITY	|	\
					 SD_SHARE_PKG_RESOURCES |	\
					 SD_NUMA		|	\
					 SD_ASYM_PACKING	|	\
					 SD_SHARE_POWERDOMAIN)
				1313
				1314	static struct sched_domain *
				1315	sd_init(struct sched_domain_topology_level *tl,
				1316	const struct cpumask *cpu_map,
				1317	struct sched_domain *child, int dflags, int cpu)
				1318	{
				1319	struct sd_data *sdd = &tl->data;
				1320	struct sched_domain sd = per_cpu_ptr(sdd->sd, cpu);
				1321	int sd_id, sd_weight, sd_flags = 0;
				1322
				1323	#ifdef CONFIG_NUMA
				1324	/*
				1325	* Ugly hack to pass state to sd_numa_mask()...
				1326	*/
				1327	sched_domains_curr_level = tl->numa_level;
				1328	#endif
				1329
				1330	sd_weight = cpumask_weight(tl->mask(cpu));
				1331
				1332	if (tl->sd_flags)
				1333	sd_flags = (*tl->sd_flags)();
				1334	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
				1335	"wrong sd_flags in topology description\n"))
				1336	sd_flags &= TOPOLOGY_SD_FLAGS;
				1337
				1338	/* Apply detected topology flags */
				1339	sd_flags \|= dflags;
				1340
				1341	*sd = (struct sched_domain){
				1342	.min_interval = sd_weight,
				1343	.max_interval = 2*sd_weight,
				1344	.busy_factor = 32,
				1345	.imbalance_pct = 125,
				1346
				1347	.cache_nice_tries = 0,
				1348
				1349	.flags = 1*SD_LOAD_BALANCE
				1350	\| 1*SD_BALANCE_NEWIDLE
				1351	\| 1*SD_BALANCE_EXEC
				1352	\| 1*SD_BALANCE_FORK
				1353	\| 0*SD_BALANCE_WAKE
				1354	\| 1*SD_WAKE_AFFINE
				1355	\| 0*SD_SHARE_CPUCAPACITY
				1356	\| 0*SD_SHARE_PKG_RESOURCES
				1357	\| 0*SD_SERIALIZE
				1358	\| 1*SD_PREFER_SIBLING
				1359	\| 0*SD_NUMA
				1360	\| sd_flags
				1361	,
				1362
				1363	.last_balance = jiffies,
				1364	.balance_interval = sd_weight,
				1365	.max_newidle_lb_cost = 0,
				1366	.next_decay_max_lb_cost = jiffies,
				1367	.child = child,
				1368	#ifdef CONFIG_SCHED_DEBUG
				1369	.name = tl->name,
				1370	#endif
				1371	};
				1372
				1373	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
				1374	sd_id = cpumask_first(sched_domain_span(sd));
				1375
				1376	/*
				1377	* Convert topological properties into behaviour.
				1378	*/
				1379
				1380	/* Don't attempt to spread across CPUs of different capacities. */
				1381	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
				1382	sd->child->flags &= ~SD_PREFER_SIBLING;
				1383
				1384	if (sd->flags & SD_SHARE_CPUCAPACITY) {
				1385	sd->imbalance_pct = 110;
				1386
				1387	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
				1388	sd->imbalance_pct = 117;
				1389	sd->cache_nice_tries = 1;
				1390
				1391	#ifdef CONFIG_NUMA
				1392	} else if (sd->flags & SD_NUMA) {
				1393	sd->cache_nice_tries = 2;
				1394
				1395	sd->flags &= ~SD_PREFER_SIBLING;
				1396	sd->flags \|= SD_SERIALIZE;
				1397	if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
				1398	sd->flags &= ~(SD_BALANCE_EXEC \|
				1399	SD_BALANCE_FORK \|
				1400	SD_WAKE_AFFINE);
				1401	}
				1402
				1403	#endif
				1404	} else {
				1405	sd->cache_nice_tries = 1;
				1406	}
				1407
				1408	/*
				1409	* For all levels sharing cache; connect a sched_domain_shared
				1410	* instance.
				1411	*/
				1412	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
				1413	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
				1414	atomic_inc(&sd->shared->ref);
				1415	atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
				1416	}
				1417
				1418	sd->private = sdd;
				1419
				1420	return sd;
				1421	}
				1422
				1423	/*
				1424	* Topology list, bottom-up.
				1425	*/
				1426	static struct sched_domain_topology_level default_topology[] = {
				1427	#ifdef CONFIG_SCHED_SMT
				1428	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
				1429	#endif
				1430	#ifdef CONFIG_SCHED_MC
				1431	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
				1432	#endif
				1433	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
				1434	{ NULL, },
				1435	};
				1436
				/*
				 * Topology table currently in effect. Starts out as default_topology and
				 * can be replaced through set_sched_topology() or by sched_init_numa().
				 */
				1437	static struct sched_domain_topology_level *sched_domain_topology =
				1438	default_topology;
				1439
				/* Walk the active topology table bottom-up until the NULL ->mask entry. */
				1440	#define for_each_sd_topology(tl) \
				1441	for (tl = sched_domain_topology; tl->mask; tl++)
				1442
				1443	void set_sched_topology(struct sched_domain_topology_level *tl)
				1444	{
				1445	if (WARN_ON_ONCE(sched_smp_initialized))
				1446	return;
				1447
				1448	sched_domain_topology = tl;
				1449	}
				1450
				1451	#ifdef CONFIG_NUMA
				1452
				1453	static const struct cpumask *sd_numa_mask(int cpu)
				1454	{
				1455	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
				1456	}
				1457
				1458	static void sched_numa_warn(const char *str)
				1459	{
				1460	static int done = false;
				1461	int i,j;
				1462
				1463	if (done)
				1464	return;
				1465
				1466	done = true;
				1467
				1468	printk(KERN_WARNING "ERROR: %s\n\n", str);
				1469
				1470	for (i = 0; i < nr_node_ids; i++) {
				1471	printk(KERN_WARNING " ");
				1472	for (j = 0; j < nr_node_ids; j++)
				1473	printk(KERN_CONT "%02d ", node_distance(i,j));
				1474	printk(KERN_CONT "\n");
				1475	}
				1476	printk(KERN_WARNING "\n");
				1477	}
				1478
				1479	bool find_numa_distance(int distance)
				1480	{
				1481	int i;
				1482
				1483	if (distance == node_distance(0, 0))
				1484	return true;
				1485
				1486	for (i = 0; i < sched_domains_numa_levels; i++) {
				1487	if (sched_domains_numa_distance[i] == distance)
				1488	return true;
				1489	}
				1490
				1491	return false;
				1492	}
				1493
				1494	/*
				1495	* A system can have three types of NUMA topology:
				1496	* NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
				1497	* NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
				1498	* NUMA_BACKPLANE: nodes can reach other nodes through a backplane
				1499	*
				1500	* The difference between a glueless mesh topology and a backplane
				1501	* topology lies in whether communication between not directly
				1502	* connected nodes goes through intermediary nodes (where programs
				1503	* could run), or through backplane controllers. This affects
				1504	* placement of programs.
				1505	*
				1506	* The type of topology can be discerned with the following tests:
				1507	* - If the maximum distance between any nodes is 1 hop, the system
				1508	* is directly connected.
				1509	* - If for two nodes A and B, located N > 1 hops away from each other,
				1510	* there is an intermediary node C, which is < N hops away from both
				1511	* nodes A and B, the system is a glueless mesh.
				1512	*/
				1513	static void init_numa_topology_type(void)
				1514	{
				1515	int a, b, c, n;
				1516
				1517	n = sched_max_numa_distance;
				1518
				1519	if (sched_domains_numa_levels <= 2) {
				1520	sched_numa_topology_type = NUMA_DIRECT;
				1521	return;
				1522	}
				1523
				1524	for_each_online_node(a) {
				1525	for_each_online_node(b) {
				1526	/* Find two nodes furthest removed from each other. */
				1527	if (node_distance(a, b) < n)
				1528	continue;
				1529
				1530	/* Is there an intermediary node between a and b? */
				1531	for_each_online_node(c) {
				1532	if (node_distance(a, c) < n &&
				1533	node_distance(b, c) < n) {
				1534	sched_numa_topology_type =
				1535	NUMA_GLUELESS_MESH;
				1536	return;
				1537	}
				1538	}
				1539
				1540	sched_numa_topology_type = NUMA_BACKPLANE;
				1541	return;
				1542	}
				1543	}
				1544	}
				1545
				1546
				/*
				 * Size of the distance bitmap in sched_init_numa() and exclusive upper
				 * bound for an accepted node_distance() value.
				 */
				1547	#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
				1548
				1549	void sched_init_numa(void)
				1550	{
				1551	struct sched_domain_topology_level *tl;
				1552	unsigned long *distance_map;
				1553	int nr_levels = 0;
				1554	int i, j;
				1555
				1556	/*
				1557	* O(nr_nodes^2) deduplicating selection sort -- in order to find the
				1558	* unique distances in the node_distance() table.
				1559	*/
				1560	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
				1561	if (!distance_map)
				1562	return;
				1563
				1564	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
				1565	for (i = 0; i < nr_node_ids; i++) {
				1566	for (j = 0; j < nr_node_ids; j++) {
				1567	int distance = node_distance(i, j);
				1568
				1569	if (distance < LOCAL_DISTANCE \|\| distance >= NR_DISTANCE_VALUES) {
				1570	sched_numa_warn("Invalid distance value range");
				1571	return;
				1572	}
				1573
				1574	bitmap_set(distance_map, distance, 1);
				1575	}
				1576	}
				1577	/*
				1578	* We can now figure out how many unique distance values there are and
				1579	* allocate memory accordingly.
				1580	*/
				1581	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
				1582
				1583	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
				1584	if (!sched_domains_numa_distance) {
				1585	bitmap_free(distance_map);
				1586	return;
				1587	}
				1588
				1589	for (i = 0, j = 0; i < nr_levels; i++, j++) {
				1590	j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
				1591	sched_domains_numa_distance[i] = j;
				1592	}
				1593
				1594	bitmap_free(distance_map);
				1595
				1596	/*
				1597	* 'nr_levels' contains the number of unique distances
				1598	*
				1599	* The sched_domains_numa_distance[] array includes the actual distance
				1600	* numbers.
				1601	*/
				1602
				1603	/*
				1604	* Here, we should temporarily reset sched_domains_numa_levels to 0.
				1605	* If it fails to allocate memory for array sched_domains_numa_masks[][],
				1606	* the array will contain less then 'nr_levels' members. This could be
				1607	* dangerous when we use it to iterate array sched_domains_numa_masks[][]
				1608	* in other functions.
				1609	*
				1610	* We reset it to 'nr_levels' at the end of this function.
				1611	*/
				1612	sched_domains_numa_levels = 0;
				1613
				1614	sched_domains_numa_masks = kzalloc(sizeof(void ) nr_levels, GFP_KERNEL);
				1615	if (!sched_domains_numa_masks)
				1616	return;
				1617
				1618	/*
				1619	* Now for each level, construct a mask per node which contains all
				1620	* CPUs of nodes that are that many hops away from us.
				1621	*/
				1622	for (i = 0; i < nr_levels; i++) {
				1623	sched_domains_numa_masks[i] =
				1624	kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
				1625	if (!sched_domains_numa_masks[i])
				1626	return;
				1627
				1628	for (j = 0; j < nr_node_ids; j++) {
				1629	struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
				1630	int k;
				1631
				1632	if (!mask)
				1633	return;
				1634
				1635	sched_domains_numa_masks[i][j] = mask;
				1636
				1637	for_each_node(k) {
				1638	if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
				1639	sched_numa_warn("Node-distance not symmetric");
				1640
				1641	if (node_distance(j, k) > sched_domains_numa_distance[i])
				1642	continue;
				1643
				1644	cpumask_or(mask, mask, cpumask_of_node(k));
				1645	}
				1646	}
				1647	}
				1648
				1649	/* Compute default topology size */
				1650	for (i = 0; sched_domain_topology[i].mask; i++);
				1651
				1652	tl = kzalloc((i + nr_levels + 1) *
				1653	sizeof(struct sched_domain_topology_level), GFP_KERNEL);
				1654	if (!tl)
				1655	return;
				1656
				1657	/*
				1658	* Copy the default topology bits..
				1659	*/
				1660	for (i = 0; sched_domain_topology[i].mask; i++)
				1661	tl[i] = sched_domain_topology[i];
				1662
				1663	/*
				1664	* Add the NUMA identity distance, aka single NODE.
				1665	*/
				1666	tl[i++] = (struct sched_domain_topology_level){
				1667	.mask = sd_numa_mask,
				1668	.numa_level = 0,
				1669	SD_INIT_NAME(NODE)
				1670	};
				1671
				1672	/*
				1673	* .. and append 'j' levels of NUMA goodness.
				1674	*/
				1675	for (j = 1; j < nr_levels; i++, j++) {
				1676	tl[i] = (struct sched_domain_topology_level){
				1677	.mask = sd_numa_mask,
				1678	.sd_flags = cpu_numa_flags,
				1679	.flags = SDTL_OVERLAP,
				1680	.numa_level = j,
				1681	SD_INIT_NAME(NUMA)
				1682	};
				1683	}
				1684
				1685	sched_domain_topology = tl;
				1686
				1687	sched_domains_numa_levels = nr_levels;
				1688	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
				1689
				1690	init_numa_topology_type();
				1691	}
				1692
				1693	void sched_domains_numa_masks_set(unsigned int cpu)
				1694	{
				1695	int node = cpu_to_node(cpu);
				1696	int i, j;
				1697
				1698	for (i = 0; i < sched_domains_numa_levels; i++) {
				1699	for (j = 0; j < nr_node_ids; j++) {
				1700	if (node_distance(j, node) <= sched_domains_numa_distance[i])
				1701	cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
				1702	}
				1703	}
				1704	}
				1705
				1706	void sched_domains_numa_masks_clear(unsigned int cpu)
				1707	{
				1708	int i, j;
				1709
				1710	for (i = 0; i < sched_domains_numa_levels; i++) {
				1711	for (j = 0; j < nr_node_ids; j++)
				1712	cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
				1713	}
				1714	}
				1715
				1716	/*
				1717	* sched_numa_find_closest() - given the NUMA topology, find the cpu
				1718	* closest to @cpu from @cpumask.
				1719	* cpumask: cpumask to find a cpu from
				1720	* cpu: cpu to be close to
				1721	*
				1722	* returns: cpu, or nr_cpu_ids when nothing found.
				1723	*/
				1724	int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
				1725	{
				1726	int i, j = cpu_to_node(cpu);
				1727
				1728	for (i = 0; i < sched_domains_numa_levels; i++) {
				1729	cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
				1730	if (cpu < nr_cpu_ids)
				1731	return cpu;
				1732	}
				1733	return nr_cpu_ids;
				1734	}
				1735
				1736	#endif /* CONFIG_NUMA */
				1737
				1738	static int __sdt_alloc(const struct cpumask *cpu_map)
				1739	{
				1740	struct sched_domain_topology_level *tl;
				1741	int j;
				1742
				1743	for_each_sd_topology(tl) {
				1744	struct sd_data *sdd = &tl->data;
				1745
				1746	sdd->sd = alloc_percpu(struct sched_domain *);
				1747	if (!sdd->sd)
				1748	return -ENOMEM;
				1749
				1750	sdd->sds = alloc_percpu(struct sched_domain_shared *);
				1751	if (!sdd->sds)
				1752	return -ENOMEM;
				1753
				1754	sdd->sg = alloc_percpu(struct sched_group *);
				1755	if (!sdd->sg)
				1756	return -ENOMEM;
				1757
				1758	sdd->sgc = alloc_percpu(struct sched_group_capacity *);
				1759	if (!sdd->sgc)
				1760	return -ENOMEM;
				1761
				1762	for_each_cpu(j, cpu_map) {
				1763	struct sched_domain *sd;
				1764	struct sched_domain_shared *sds;
				1765	struct sched_group *sg;
				1766	struct sched_group_capacity *sgc;
				1767
				1768	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
				1769	GFP_KERNEL, cpu_to_node(j));
				1770	if (!sd)
				1771	return -ENOMEM;
				1772
				1773	*per_cpu_ptr(sdd->sd, j) = sd;
				1774
				1775	sds = kzalloc_node(sizeof(struct sched_domain_shared),
				1776	GFP_KERNEL, cpu_to_node(j));
				1777	if (!sds)
				1778	return -ENOMEM;
				1779
				1780	*per_cpu_ptr(sdd->sds, j) = sds;
				1781
				1782	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				1783	GFP_KERNEL, cpu_to_node(j));
				1784	if (!sg)
				1785	return -ENOMEM;
				1786
				1787	sg->next = sg;
				1788
				1789	*per_cpu_ptr(sdd->sg, j) = sg;
				1790
				1791	sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
				1792	GFP_KERNEL, cpu_to_node(j));
				1793	if (!sgc)
				1794	return -ENOMEM;
				1795
				1796	#ifdef CONFIG_SCHED_DEBUG
				1797	sgc->id = j;
				1798	#endif
				1799
				1800	*per_cpu_ptr(sdd->sgc, j) = sgc;
				1801	}
				1802	}
				1803
				1804	return 0;
				1805	}
				1806
				1807	static void __sdt_free(const struct cpumask *cpu_map)
				1808	{
				1809	struct sched_domain_topology_level *tl;
				1810	int j;
				1811
				1812	for_each_sd_topology(tl) {
				1813	struct sd_data *sdd = &tl->data;
				1814
				1815	for_each_cpu(j, cpu_map) {
				1816	struct sched_domain *sd;
				1817
				1818	if (sdd->sd) {
				1819	sd = *per_cpu_ptr(sdd->sd, j);
				1820	if (sd && (sd->flags & SD_OVERLAP))
				1821	free_sched_groups(sd->groups, 0);
				1822	kfree(*per_cpu_ptr(sdd->sd, j));
				1823	}
				1824
				1825	if (sdd->sds)
				1826	kfree(*per_cpu_ptr(sdd->sds, j));
				1827	if (sdd->sg)
				1828	kfree(*per_cpu_ptr(sdd->sg, j));
				1829	if (sdd->sgc)
				1830	kfree(*per_cpu_ptr(sdd->sgc, j));
				1831	}
				1832	free_percpu(sdd->sd);
				1833	sdd->sd = NULL;
				1834	free_percpu(sdd->sds);
				1835	sdd->sds = NULL;
				1836	free_percpu(sdd->sg);
				1837	sdd->sg = NULL;
				1838	free_percpu(sdd->sgc);
				1839	sdd->sgc = NULL;
				1840	}
				1841	}
				1842
				1843	static struct sched_domain build_sched_domain(struct sched_domain_topology_level tl,
				1844	const struct cpumask cpu_map, struct sched_domain_attr attr,
				1845	struct sched_domain *child, int dflags, int cpu)
				1846	{
				1847	struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
				1848
				1849	if (child) {
				1850	sd->level = child->level + 1;
				1851	sched_domain_level_max = max(sched_domain_level_max, sd->level);
				1852	child->parent = sd;
				1853
				1854	if (!cpumask_subset(sched_domain_span(child),
				1855	sched_domain_span(sd))) {
				1856	pr_err("BUG: arch topology borken\n");
				1857	#ifdef CONFIG_SCHED_DEBUG
				1858	pr_err(" the %s domain not a subset of the %s domain\n",
				1859	child->name, sd->name);
				1860	#endif
				1861	/* Fixup, ensure @sd has at least @child CPUs. */
				1862	cpumask_or(sched_domain_span(sd),
				1863	sched_domain_span(sd),
				1864	sched_domain_span(child));
				1865	}
				1866
				1867	}
				1868	set_domain_attribute(sd, attr);
				1869
				1870	return sd;
				1871	}
				1872
				1873	/*
				1874	* Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
				1875	* any two given CPUs at this (non-NUMA) topology level.
				1876	*/
				1877	static bool topology_span_sane(struct sched_domain_topology_level *tl,
				1878	const struct cpumask *cpu_map, int cpu)
				1879	{
				1880	int i;
				1881
				1882	/* NUMA levels are allowed to overlap */
				1883	if (tl->flags & SDTL_OVERLAP)
				1884	return true;
				1885
				1886	/*
				1887	* Non-NUMA levels cannot partially overlap - they must be either
				1888	* completely equal or completely disjoint. Otherwise we can end up
				1889	* breaking the sched_group lists - i.e. a later get_group() pass
				1890	* breaks the linking done for an earlier span.
				1891	*/
				1892	for_each_cpu(i, cpu_map) {
				1893	if (i == cpu)
				1894	continue;
				1895	/*
				1896	* We should 'and' all those masks with 'cpu_map' to exactly
				1897	* match the topology we're about to build, but that can only
				1898	* remove CPUs, which only lessens our ability to detect
				1899	* overlaps
				1900	*/
				1901	if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
				1902	cpumask_intersects(tl->mask(cpu), tl->mask(i)))
				1903	return false;
				1904	}
				1905
				1906	return true;
				1907	}
				1908
				1909	/*
				1910	* Find the sched_domain_topology_level where all CPU capacities are visible
				1911	* for all CPUs.
				1912	*/
				1913	static struct sched_domain_topology_level
				1914	asym_cpu_capacity_level(const struct cpumask cpu_map)
				1915	{
				1916	int i, j, asym_level = 0;
				1917	bool asym = false;
				1918	struct sched_domain_topology_level tl, asym_tl = NULL;
				1919	unsigned long cap;
				1920
				1921	/* Is there any asymmetry? */
				1922	cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
				1923
				1924	for_each_cpu(i, cpu_map) {
				1925	if (arch_scale_cpu_capacity(i) != cap) {
				1926	asym = true;
				1927	break;
				1928	}
				1929	}
				1930
				1931	if (!asym)
				1932	return NULL;
				1933
				1934	/*
				1935	* Examine topology from all CPU's point of views to detect the lowest
				1936	* sched_domain_topology_level where a highest capacity CPU is visible
				1937	* to everyone.
				1938	*/
				1939	for_each_cpu(i, cpu_map) {
				1940	unsigned long max_capacity = arch_scale_cpu_capacity(i);
				1941	int tl_id = 0;
				1942
				1943	for_each_sd_topology(tl) {
				1944	if (tl_id < asym_level)
				1945	goto next_level;
				1946
				1947	for_each_cpu_and(j, tl->mask(i), cpu_map) {
				1948	unsigned long capacity;
				1949
				1950	capacity = arch_scale_cpu_capacity(j);
				1951
				1952	if (capacity <= max_capacity)
				1953	continue;
				1954
				1955	max_capacity = capacity;
				1956	asym_level = tl_id;
				1957	asym_tl = tl;
				1958	}
				1959	next_level:
				1960	tl_id++;
				1961	}
				1962	}
				1963
				1964	return asym_tl;
				1965	}
				1966
				1967
				1968	/*
				1969	* Build sched domains for a given set of CPUs and attach the sched domains
				1970	* to the individual CPUs
				1971	*/
				1972	static int
				1973	build_sched_domains(const struct cpumask cpu_map, struct sched_domain_attr attr)
				1974	{
				1975	enum s_alloc alloc_state = sa_none;
				1976	struct sched_domain *sd;
				1977	struct s_data d;
				1978	int i, ret = -ENOMEM;
				1979	struct sched_domain_topology_level *tl_asym;
				1980	bool has_asym = false;
				1981
				1982	if (WARN_ON(cpumask_empty(cpu_map)))
				1983	goto error;
				1984
				1985	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
				1986	if (alloc_state != sa_rootdomain)
				1987	goto error;
				1988
				1989	tl_asym = asym_cpu_capacity_level(cpu_map);
				1990
				1991	/* Set up domains for CPUs specified by the cpu_map: */
				1992	for_each_cpu(i, cpu_map) {
				1993	struct sched_domain_topology_level *tl;
				1994
				1995	sd = NULL;
				1996	for_each_sd_topology(tl) {
				1997	int dflags = 0;
				1998
				1999	if (tl == tl_asym) {
				2000	dflags \|= SD_ASYM_CPUCAPACITY;
				2001	has_asym = true;
				2002	}
				2003
				2004	if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
				2005	goto error;
				2006
				2007	sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
				2008
				2009	if (tl == sched_domain_topology)
				2010	*per_cpu_ptr(d.sd, i) = sd;
				2011	if (tl->flags & SDTL_OVERLAP)
				2012	sd->flags \|= SD_OVERLAP;
				2013	if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				2014	break;
				2015	}
				2016	}
				2017
				2018	/* Build the groups for the domains */
				2019	for_each_cpu(i, cpu_map) {
				2020	for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
				2021	sd->span_weight = cpumask_weight(sched_domain_span(sd));
				2022	if (sd->flags & SD_OVERLAP) {
				2023	if (build_overlap_sched_groups(sd, i))
				2024	goto error;
				2025	} else {
				2026	if (build_sched_groups(sd, i))
				2027	goto error;
				2028	}
				2029	}
				2030	}
				2031
				2032	/* Calculate CPU capacity for physical packages and nodes */
				2033	for (i = nr_cpumask_bits-1; i >= 0; i--) {
				2034	if (!cpumask_test_cpu(i, cpu_map))
				2035	continue;
				2036
				2037	for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
				2038	claim_allocations(i, sd);
				2039	init_sched_groups_capacity(i, sd);
				2040	}
				2041	}
				2042
				2043	/* Attach the domains */
				2044	rcu_read_lock();
				2045	for_each_cpu(i, cpu_map) {
				2046	sd = *per_cpu_ptr(d.sd, i);
				2047	cpu_attach_domain(sd, d.rd, i);
				2048	}
				2049	rcu_read_unlock();
				2050
				2051	if (has_asym)
				2052	static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
				2053
				2054	ret = 0;
				2055	error:
				2056	__free_domain_allocs(&d, alloc_state, cpu_map);
				2057
				2058	return ret;
				2059	}
				2060
				2061	/* Current sched domains: */
				2062	static cpumask_var_t *doms_cur;
				2063
				2064	/* Number of sched domains in 'doms_cur': */
				2065	static int ndoms_cur;
				2066
				2067	/* Attributes of custom domains in 'doms_cur' */
				2068	static struct sched_domain_attr *dattr_cur;
				2069
				2070	/*
				2071	* Special case: If a kmalloc() of a doms_cur partition (array of
				2072	* cpumask) fails, then fallback to a single sched domain,
				2073	* as determined by the single cpumask fallback_doms.
				2074	*/
				2075	static cpumask_var_t fallback_doms;
				2076
				2077	/*
				2078	* arch_update_cpu_topology lets virtualized architectures update the
				2079	* CPU core maps. It is supposed to return 1 if the topology changed
				2080	* or 0 if it stayed the same.
				2081	*/
				2082	int __weak arch_update_cpu_topology(void)
				2083	{
				2084	return 0;
				2085	}
				2086
				2087	cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
				2088	{
				2089	int i;
				2090	cpumask_var_t *doms;
				2091
				2092	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
				2093	if (!doms)
				2094	return NULL;
				2095	for (i = 0; i < ndoms; i++) {
				2096	if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
				2097	free_sched_domains(doms, i);
				2098	return NULL;
				2099	}
				2100	}
				2101	return doms;
				2102	}
				2103
				2104	void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
				2105	{
				2106	unsigned int i;
				2107	for (i = 0; i < ndoms; i++)
				2108	free_cpumask_var(doms[i]);
				2109	kfree(doms);
				2110	}
				2111
				2112	/*
				2113	* Set up scheduler domains and groups. For now this just excludes isolated
				2114	* CPUs, but could be used to exclude other special cases in the future.
				2115	*/
				2116	int sched_init_domains(const struct cpumask *cpu_map)
				2117	{
				2118	int err;
				2119
				2120	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
				2121	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
				2122	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
				2123
				2124	arch_update_cpu_topology();
				2125	ndoms_cur = 1;
				2126	doms_cur = alloc_sched_domains(ndoms_cur);
				2127	if (!doms_cur)
				2128	doms_cur = &fallback_doms;
				2129	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
				2130	err = build_sched_domains(doms_cur[0], NULL);
				2131	register_sched_domain_sysctl();
				2132
				2133	return err;
				2134	}
				2135
				2136	/*
				2137	* Detach sched domains from a group of CPUs specified in cpu_map
				2138	* These CPUs will now be attached to the NULL domain
				2139	*/
				2140	static void detach_destroy_domains(const struct cpumask *cpu_map)
				2141	{
				2142	unsigned int cpu = cpumask_any(cpu_map);
				2143	int i;
				2144
				2145	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
				2146	static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
				2147
				2148	rcu_read_lock();
				2149	for_each_cpu(i, cpu_map)
				2150	cpu_attach_domain(NULL, &def_root_domain, i);
				2151	rcu_read_unlock();
				2152	}
				2153
				2154	/* handle null as "default" */
				2155	static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
				2156	struct sched_domain_attr *new, int idx_new)
				2157	{
				2158	struct sched_domain_attr tmp;
				2159
				2160	/* Fast path: */
				2161	if (!new && !cur)
				2162	return 1;
				2163
				2164	tmp = SD_ATTR_INIT;
				2165
				2166	return !memcmp(cur ? (cur + idx_cur) : &tmp,
				2167	new ? (new + idx_new) : &tmp,
				2168	sizeof(struct sched_domain_attr));
				2169	}
				2170
				2171	/*
				2172	* Partition sched domains as specified by the 'ndoms_new'
				2173	* cpumasks in the array doms_new[] of cpumasks. This compares
				2174	* doms_new[] to the current sched domain partitioning, doms_cur[].
				2175	* It destroys each deleted domain and builds each new domain.
				2176	*
				2177	* 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
				2178	* The masks don't intersect (don't overlap.) We should setup one
				2179	* sched domain for each mask. CPUs not in any of the cpumasks will
				2180	* not be load balanced. If the same cpumask appears both in the
				2181	* current 'doms_cur' domains and in the new 'doms_new', we can leave
				2182	* it as it is.
				2183	*
				2184	* The passed in 'doms_new' should be allocated using
				2185	* alloc_sched_domains. This routine takes ownership of it and will
				2186	* free_sched_domains it when done with it. If the caller failed the
				2187	* alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
				2188	* and partition_sched_domains() will fallback to the single partition
				2189	* 'fallback_doms', it also forces the domains to be rebuilt.
				2190	*
				2191	* If doms_new == NULL it will be replaced with cpu_online_mask.
				2192	* ndoms_new == 0 is a special case for destroying existing domains,
				2193	* and it will not create the default domain.
				2194	*
				2195	* Call with hotplug lock and sched_domains_mutex held
				2196	*/
				2197	void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
				2198	struct sched_domain_attr *dattr_new)
				2199	{
				2200	bool __maybe_unused has_eas = false;
				2201	int i, j, n;
				2202	int new_topology;
				2203
				2204	lockdep_assert_held(&sched_domains_mutex);
				2205
				2206	/* Always unregister in case we don't destroy any domains: */
				2207	unregister_sched_domain_sysctl();
				2208
				2209	/* Let the architecture update CPU core mappings: */
				2210	new_topology = arch_update_cpu_topology();
				2211
				2212	if (!doms_new) {
				2213	WARN_ON_ONCE(dattr_new);
				2214	n = 0;
				2215	doms_new = alloc_sched_domains(1);
				2216	if (doms_new) {
				2217	n = 1;
				2218	cpumask_and(doms_new[0], cpu_active_mask,
				2219	housekeeping_cpumask(HK_FLAG_DOMAIN));
				2220	}
				2221	} else {
				2222	n = ndoms_new;
				2223	}
				2224
				2225	/* Destroy deleted domains: */
				2226	for (i = 0; i < ndoms_cur; i++) {
				2227	for (j = 0; j < n && !new_topology; j++) {
				2228	if (cpumask_equal(doms_cur[i], doms_new[j]) &&
				2229	dattrs_equal(dattr_cur, i, dattr_new, j)) {
				2230	struct root_domain *rd;
				2231
				2232	/*
				2233	* This domain won't be destroyed and as such
				2234	* its dl_bw->total_bw needs to be cleared. It
				2235	* will be recomputed in function
				2236	* update_tasks_root_domain().
				2237	*/
				2238	rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
				2239	dl_clear_root_domain(rd);
				2240	goto match1;
				2241	}
				2242	}
				2243	/* No match - a current sched domain not in new doms_new[] */
				2244	detach_destroy_domains(doms_cur[i]);
				2245	match1:
				2246	;
				2247	}
				2248
				2249	n = ndoms_cur;
				2250	if (!doms_new) {
				2251	n = 0;
				2252	doms_new = &fallback_doms;
				2253	cpumask_and(doms_new[0], cpu_active_mask,
				2254	housekeeping_cpumask(HK_FLAG_DOMAIN));
				2255	}
				2256
				2257	/* Build new domains: */
				2258	for (i = 0; i < ndoms_new; i++) {
				2259	for (j = 0; j < n && !new_topology; j++) {
				2260	if (cpumask_equal(doms_new[i], doms_cur[j]) &&
				2261	dattrs_equal(dattr_new, i, dattr_cur, j))
				2262	goto match2;
				2263	}
				2264	/* No match - add a new doms_new */
				2265	build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
				2266	match2:
				2267	;
				2268	}
				2269
				2270	#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
				2271	/* Build perf. domains: */
				2272	for (i = 0; i < ndoms_new; i++) {
				2273	for (j = 0; j < n && !sched_energy_update; j++) {
				2274	if (cpumask_equal(doms_new[i], doms_cur[j]) &&
				2275	cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
				2276	has_eas = true;
				2277	goto match3;
				2278	}
				2279	}
				2280	/* No match - add perf. domains for a new rd */
				2281	has_eas \|= build_perf_domains(doms_new[i]);
				2282	match3:
				2283	;
				2284	}
				2285	sched_energy_set(has_eas);
				2286	#endif
				2287
				2288	/* Remember the new sched domains: */
				2289	if (doms_cur != &fallback_doms)
				2290	free_sched_domains(doms_cur, ndoms_cur);
				2291
				2292	kfree(dattr_cur);
				2293	doms_cur = doms_new;
				2294	dattr_cur = dattr_new;
				2295	ndoms_cur = ndoms_new;
				2296
				2297	register_sched_domain_sysctl();
				2298	}
				2299
				2300	/*
				2301	* Call with hotplug lock held
				2302	*/
				2303	void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				2304	struct sched_domain_attr *dattr_new)
				2305	{
				2306	mutex_lock(&sched_domains_mutex);
				2307	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
				2308	mutex_unlock(&sched_domains_mutex);
				2309	}