/*
 * mm/percpu-vm.c - vmalloc area based chunk allocation
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * Chunks are mapped into vmalloc areas and populated page by page.
 * This is the default chunk allocator.
 *
 * This file is #included by mm/percpu.c rather than built on its own;
 * percpu-km.c provides the alternative allocator for configurations
 * that select CONFIG_NEED_PER_CPU_KM.
 */

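/* return the struct page backing @chunk's area for @cpu at @page_idx */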
static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
				    unsigned int cpu, int page_idx)
{
	/* must not be used on pre-mapped chunk */
	WARN_ON(chunk->immutable);

	return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
}

/**
 * pcpu_get_pages - get temp pages array
 *
 * Returns pointer to array of pointers to struct page which can be indexed
 * with pcpu_page_idx().  Note that there is only one array and accesses
 * should be serialized by pcpu_alloc_mutex.
 *
 * RETURNS:
 * Pointer to temp pages array on success.
 */
static struct page **pcpu_get_pages(void)
{
	static struct page **pages;
	size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);

	lockdep_assert_held(&pcpu_alloc_mutex);

	if (!pages)
		pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL);
	return pages;
}
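/*
 * Layout sketch of the temp array (assuming pcpu_page_idx() in mm/percpu.c
 * computes pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx): one slot per
 * (unit, page) pair, with each unit's pages stored contiguously, i.e.
 *
 *	pages[] = { unit0/page0, unit0/page1, ..., unit1/page0, ... }
 *
 * for a total of pcpu_nr_units * pcpu_unit_pages entries.
 */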

/**
 * pcpu_free_pages - free pages which were allocated for @chunk
 * @chunk: chunk pages were allocated for
 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
 * @page_start: page index of the first page to be freed
 * @page_end: page index of the last page to be freed + 1
 *
 * Free pages [@page_start, @page_end) in @pages for all units.
 * The pages were allocated for @chunk.
 */
static void pcpu_free_pages(struct pcpu_chunk *chunk,
			    struct page **pages, int page_start, int page_end)
{
	unsigned int cpu;
	int i;

	for_each_possible_cpu(cpu) {
		for (i = page_start; i < page_end; i++) {
			struct page *page = pages[pcpu_page_idx(cpu, i)];

			if (page)
				__free_page(page);
		}
	}
}

/**
 * pcpu_alloc_pages - allocates pages for @chunk
 * @chunk: target chunk
 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
 * @page_start: page index of the first page to be allocated
 * @page_end: page index of the last page to be allocated + 1
 * @gfp: allocation flags passed to the underlying allocator
 *
 * Allocate pages [@page_start,@page_end) into @pages for all units.
 * The allocation is for @chunk.  Percpu core doesn't care about the
 * content of @pages and will pass it verbatim to pcpu_map_pages().
 */
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
			    struct page **pages, int page_start, int page_end,
			    gfp_t gfp)
{
	unsigned int cpu, tcpu;
	int i;

	/* the pages are mapped into vmalloc space, so highmem is fine */
	gfp |= __GFP_HIGHMEM;

	for_each_possible_cpu(cpu) {
		for (i = page_start; i < page_end; i++) {
			struct page **pagep = &pages[pcpu_page_idx(cpu, i)];

			*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
			if (!*pagep)
				goto err;
		}
	}
	return 0;

err:
	/* free the pages already allocated for the failing cpu ... */
	while (--i >= page_start)
		__free_page(pages[pcpu_page_idx(cpu, i)]);

	/* ... and everything allocated for the cpus handled before it */
	for_each_possible_cpu(tcpu) {
		if (tcpu == cpu)
			break;
		for (i = page_start; i < page_end; i++)
			__free_page(pages[pcpu_page_idx(tcpu, i)]);
	}
	return -ENOMEM;
}

/**
 * pcpu_pre_unmap_flush - flush cache prior to unmapping
 * @chunk: chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages in [@page_start,@page_end) of @chunk are about to be
 * unmapped.  Flush cache.  As each flush can be very expensive,
 * flush the whole region at once rather than doing it for each cpu.
 * This may be overkill but is more scalable.
 */
static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
				 int page_start, int page_end)
{
	flush_cache_vunmap(
		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

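/*
 * Unmap @nr_pages worth of kernel mappings starting at @addr without any
 * flushing; cache and TLB flushing are left to pcpu_pre_unmap_flush() and
 * pcpu_post_unmap_tlb_flush() respectively.
 */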
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
	unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
}

/**
 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array used to return the unmapped pages
 * @page_start: page index of the first page to unmap
 * @page_end: page index of the last page to unmap + 1
 *
 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 * The unmapped pages are stored in the corresponding elements of
 * @pages so that pcpu_free_pages(), which is called after all unmaps
 * are finished, can free them.  The caller should call proper
 * pre/post flush functions.
 */
static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
			     struct page **pages, int page_start, int page_end)
{
	unsigned int cpu;
	int i;

	for_each_possible_cpu(cpu) {
		for (i = page_start; i < page_end; i++) {
			struct page *page;

			page = pcpu_chunk_page(chunk, cpu, i);
			WARN_ON(!page);
			pages[pcpu_page_idx(cpu, i)] = page;
		}
		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
				   page_end - page_start);
	}
}

/**
 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
 * TLB for the regions.  This can be skipped if the area is to be
 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
 *
 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
 * for the whole region.
 */
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
				      int page_start, int page_end)
{
	flush_tlb_kernel_range(
		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

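/*
 * Map @nr_pages pages from @pages into the kernel page tables at @addr
 * without any flushing; callers are expected to flush caches afterwards
 * via pcpu_post_map_flush().
 */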
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
			    int nr_pages)
{
	return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
					PAGE_KERNEL, pages);
}

/**
 * pcpu_map_pages - map pages into a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array containing pages to be mapped
 * @page_start: page index of the first page to map
 * @page_end: page index of the last page to map + 1
 *
 * For each cpu, map pages [@page_start,@page_end) into @chunk.  The
 * caller is responsible for calling pcpu_post_map_flush() after all
 * mappings are complete.
 *
 * This function is responsible for setting up whatever is necessary for
 * reverse lookup (addr -> chunk).
 */
static int pcpu_map_pages(struct pcpu_chunk *chunk,
			  struct page **pages, int page_start, int page_end)
{
	unsigned int cpu, tcpu;
	int i, err;

	for_each_possible_cpu(cpu) {
		err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
				       &pages[pcpu_page_idx(cpu, page_start)],
				       page_end - page_start);
		if (err < 0)
			goto err;

		/* record the owning chunk for addr -> chunk reverse lookup */
		for (i = page_start; i < page_end; i++)
			pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
					    chunk);
	}
	return 0;
err:
	/* unmap whatever was mapped for the cpus handled before the failure */
	for_each_possible_cpu(tcpu) {
		if (tcpu == cpu)
			break;
		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
				   page_end - page_start);
	}
	pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
	return err;
}

/**
 * pcpu_post_map_flush - flush cache after mapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
 * cache.
 *
 * As with pcpu_pre_unmap_flush(), cache flushing is also done at once
 * for the whole region.
 */
static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
				int page_start, int page_end)
{
	flush_cache_vmap(
		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

/**
 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 * @chunk: chunk of interest
 * @page_start: the start page
 * @page_end: the end page
 * @gfp: allocation flags passed to the underlying memory allocator
 *
 * For each cpu, populate and map pages [@page_start,@page_end) into
 * @chunk.
 *
 * CONTEXT:
 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
			       int page_start, int page_end, gfp_t gfp)
{
	struct page **pages;

	pages = pcpu_get_pages();
	if (!pages)
		return -ENOMEM;

	if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
		return -ENOMEM;

	if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
		pcpu_free_pages(chunk, pages, page_start, page_end);
		return -ENOMEM;
	}
	pcpu_post_map_flush(chunk, page_start, page_end);

	return 0;
}
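/*
 * Rough caller-side sketch (the real callers live in mm/percpu.c and exact
 * names/signatures may differ between kernel versions): the allocator core
 * fills in missing pages for an allocation roughly like
 *
 *	if (!pcpu_populate_chunk(chunk, rs, re, gfp))
 *		pcpu_chunk_populated(chunk, rs, re);
 *
 * with pcpu_alloc_mutex held, so the chunk's populated bitmap and counters
 * stay in sync with the actual mappings.
 */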

/**
 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 * @chunk: chunk to depopulate
 * @page_start: the start page
 * @page_end: the end page
 *
 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 * from @chunk.
 *
 * CONTEXT:
 * pcpu_alloc_mutex.
 */
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
				  int page_start, int page_end)
{
	struct page **pages;

	/*
	 * If control reaches here, there must have been at least one
	 * successful population attempt so the temp pages array must
	 * be available now.
	 */
	pages = pcpu_get_pages();
	BUG_ON(!pages);

	/* unmap and free */
	pcpu_pre_unmap_flush(chunk, page_start, page_end);

	pcpu_unmap_pages(chunk, pages, page_start, page_end);

	/* no need to flush tlb, vmalloc will handle it lazily */

	pcpu_free_pages(chunk, pages, page_start, page_end);
}

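/*
 * Allocate a new chunk and reserve vmalloc address space for each percpu
 * group via pcpu_get_vm_areas().  No pages are allocated or mapped here;
 * that happens later through pcpu_populate_chunk().
 */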
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
	struct pcpu_chunk *chunk;
	struct vm_struct **vms;

	chunk = pcpu_alloc_chunk(gfp);
	if (!chunk)
		return NULL;

	vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
				pcpu_nr_groups, pcpu_atom_size);
	if (!vms) {
		pcpu_free_chunk(chunk);
		return NULL;
	}

	chunk->data = vms;
	chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];

	pcpu_stats_chunk_alloc();
	trace_percpu_create_chunk(chunk->base_addr);

	return chunk;
}

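/*
 * Tear down a chunk created by pcpu_create_chunk(): release the vmalloc
 * areas stashed in chunk->data, then free the chunk itself.  Safe to call
 * on a NULL or partially constructed chunk.
 */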
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
{
	if (!chunk)
		return;

	pcpu_stats_chunk_dealloc();
	trace_percpu_destroy_chunk(chunk->base_addr);

	if (chunk->data)
		pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
	pcpu_free_chunk(chunk);
}

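/*
 * Translate a percpu address to its backing struct page; since chunks live
 * in vmalloc space here, this is just vmalloc_to_page().
 */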
static struct page *pcpu_addr_to_page(void *addr)
{
	return vmalloc_to_page(addr);
}

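/*
 * The vmalloc-based allocator places no extra constraints on the first
 * chunk layout, hence the unconditional success here.
 */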
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
{
	/* no extra restriction */
	return 0;
}