| xj | b04a402 | 2021-11-25 15:01:52 +0800 | [diff] [blame] | 1 | /* | 
|  | 2 | * mm/percpu-vm.c - vmalloc area based chunk allocation | 
|  | 3 | * | 
|  | 4 | * Copyright (C) 2010		SUSE Linux Products GmbH | 
|  | 5 | * Copyright (C) 2010		Tejun Heo <tj@kernel.org> | 
|  | 6 | * | 
|  | 7 | * This file is released under the GPLv2. | 
|  | 8 | * | 
|  | 9 | * Chunks are mapped into vmalloc areas and populated page by page. | 
|  | 10 | * This is the default chunk allocator. | 
|  | 11 | */ | 
|  | 12 |  | 
|  | 13 | static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, | 
|  | 14 | unsigned int cpu, int page_idx) | 
|  | 15 | { | 
|  | 16 | /* must not be used on pre-mapped chunk */ | 
|  | 17 | WARN_ON(chunk->immutable); | 
|  | 18 |  | 
|  | 19 | return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); | 
|  | 20 | } | 
|  | 21 |  | 
|  | 22 | /** | 
|  | 23 | * pcpu_get_pages - get temp pages array | 
|  | 24 | * | 
|  | 25 | * Returns pointer to array of pointers to struct page which can be indexed | 
|  | 26 | * with pcpu_page_idx().  Note that there is only one array and accesses | 
|  | 27 | * should be serialized by pcpu_alloc_mutex. | 
|  | 28 | * | 
|  | 29 | * RETURNS: | 
|  | 30 | * Pointer to temp pages array on success. | 
|  | 31 | */ | 
|  | 32 | static struct page **pcpu_get_pages(void) | 
|  | 33 | { | 
|  | 34 | static struct page **pages; | 
|  | 35 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); | 
|  | 36 |  | 
|  | 37 | lockdep_assert_held(&pcpu_alloc_mutex); | 
|  | 38 |  | 
|  | 39 | if (!pages) | 
|  | 40 | pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); | 
|  | 41 | return pages; | 
|  | 42 | } | 
|  | 43 |  | 
|  | 44 | /** | 
|  | 45 | * pcpu_free_pages - free pages which were allocated for @chunk | 
|  | 46 | * @chunk: chunk pages were allocated for | 
|  | 47 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() | 
|  | 48 | * @page_start: page index of the first page to be freed | 
|  | 49 | * @page_end: page index of the last page to be freed + 1 | 
|  | 50 | * | 
|  | 51 | * Free pages [@page_start and @page_end) in @pages for all units. | 
|  | 52 | * The pages were allocated for @chunk. | 
|  | 53 | */ | 
|  | 54 | static void pcpu_free_pages(struct pcpu_chunk *chunk, | 
|  | 55 | struct page **pages, int page_start, int page_end) | 
|  | 56 | { | 
|  | 57 | unsigned int cpu; | 
|  | 58 | int i; | 
|  | 59 |  | 
|  | 60 | for_each_possible_cpu(cpu) { | 
|  | 61 | for (i = page_start; i < page_end; i++) { | 
|  | 62 | struct page *page = pages[pcpu_page_idx(cpu, i)]; | 
|  | 63 |  | 
|  | 64 | if (page) | 
|  | 65 | __free_page(page); | 
|  | 66 | } | 
|  | 67 | } | 
|  | 68 | } | 
|  | 69 |  | 
|  | 70 | /** | 
|  | 71 | * pcpu_alloc_pages - allocates pages for @chunk | 
|  | 72 | * @chunk: target chunk | 
|  | 73 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() | 
|  | 74 | * @page_start: page index of the first page to be allocated | 
|  | 75 | * @page_end: page index of the last page to be allocated + 1 | 
|  | 76 | * @gfp: allocation flags passed to the underlying allocator | 
|  | 77 | * | 
|  | 78 | * Allocate pages [@page_start,@page_end) into @pages for all units. | 
|  | 79 | * The allocation is for @chunk.  Percpu core doesn't care about the | 
|  | 80 | * content of @pages and will pass it verbatim to pcpu_map_pages(). | 
|  | 81 | */ | 
|  | 82 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, | 
|  | 83 | struct page **pages, int page_start, int page_end, | 
|  | 84 | gfp_t gfp) | 
|  | 85 | { | 
|  | 86 | unsigned int cpu, tcpu; | 
|  | 87 | int i; | 
|  | 88 |  | 
|  | 89 | gfp |= __GFP_HIGHMEM; | 
|  | 90 |  | 
|  | 91 | for_each_possible_cpu(cpu) { | 
|  | 92 | for (i = page_start; i < page_end; i++) { | 
|  | 93 | struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; | 
|  | 94 |  | 
|  | 95 | *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); | 
|  | 96 | if (!*pagep) | 
|  | 97 | goto err; | 
|  | 98 | } | 
|  | 99 | } | 
|  | 100 | return 0; | 
|  | 101 |  | 
|  | 102 | err: | 
|  | 103 | while (--i >= page_start) | 
|  | 104 | __free_page(pages[pcpu_page_idx(cpu, i)]); | 
|  | 105 |  | 
|  | 106 | for_each_possible_cpu(tcpu) { | 
|  | 107 | if (tcpu == cpu) | 
|  | 108 | break; | 
|  | 109 | for (i = page_start; i < page_end; i++) | 
|  | 110 | __free_page(pages[pcpu_page_idx(tcpu, i)]); | 
|  | 111 | } | 
|  | 112 | return -ENOMEM; | 
|  | 113 | } | 
|  | 114 |  | 
|  | 115 | /** | 
|  | 116 | * pcpu_pre_unmap_flush - flush cache prior to unmapping | 
|  | 117 | * @chunk: chunk the regions to be flushed belongs to | 
|  | 118 | * @page_start: page index of the first page to be flushed | 
|  | 119 | * @page_end: page index of the last page to be flushed + 1 | 
|  | 120 | * | 
|  | 121 | * Pages in [@page_start,@page_end) of @chunk are about to be | 
|  | 122 | * unmapped.  Flush cache.  As each flushing trial can be very | 
|  | 123 | * expensive, issue flush on the whole region at once rather than | 
|  | 124 | * doing it for each cpu.  This could be an overkill but is more | 
|  | 125 | * scalable. | 
|  | 126 | */ | 
|  | 127 | static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, | 
|  | 128 | int page_start, int page_end) | 
|  | 129 | { | 
|  | 130 | flush_cache_vunmap( | 
|  | 131 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), | 
|  | 132 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); | 
|  | 133 | } | 
|  | 134 |  | 
|  | 135 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) | 
|  | 136 | { | 
|  | 137 | unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); | 
|  | 138 | } | 
|  | 139 |  | 
|  | 140 | /** | 
|  | 141 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk | 
|  | 142 | * @chunk: chunk of interest | 
|  | 143 | * @pages: pages array which can be used to pass information to free | 
|  | 144 | * @page_start: page index of the first page to unmap | 
|  | 145 | * @page_end: page index of the last page to unmap + 1 | 
|  | 146 | * | 
|  | 147 | * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. | 
|  | 148 | * Corresponding elements in @pages were cleared by the caller and can | 
|  | 149 | * be used to carry information to pcpu_free_pages() which will be | 
|  | 150 | * called after all unmaps are finished.  The caller should call | 
|  | 151 | * proper pre/post flush functions. | 
|  | 152 | */ | 
|  | 153 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, | 
|  | 154 | struct page **pages, int page_start, int page_end) | 
|  | 155 | { | 
|  | 156 | unsigned int cpu; | 
|  | 157 | int i; | 
|  | 158 |  | 
|  | 159 | for_each_possible_cpu(cpu) { | 
|  | 160 | for (i = page_start; i < page_end; i++) { | 
|  | 161 | struct page *page; | 
|  | 162 |  | 
|  | 163 | page = pcpu_chunk_page(chunk, cpu, i); | 
|  | 164 | WARN_ON(!page); | 
|  | 165 | pages[pcpu_page_idx(cpu, i)] = page; | 
|  | 166 | } | 
|  | 167 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), | 
|  | 168 | page_end - page_start); | 
|  | 169 | } | 
|  | 170 | } | 
|  | 171 |  | 
|  | 172 | /** | 
|  | 173 | * pcpu_post_unmap_tlb_flush - flush TLB after unmapping | 
|  | 174 | * @chunk: pcpu_chunk the regions to be flushed belong to | 
|  | 175 | * @page_start: page index of the first page to be flushed | 
|  | 176 | * @page_end: page index of the last page to be flushed + 1 | 
|  | 177 | * | 
|  | 178 | * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush | 
|  | 179 | * TLB for the regions.  This can be skipped if the area is to be | 
|  | 180 | * returned to vmalloc as vmalloc will handle TLB flushing lazily. | 
|  | 181 | * | 
|  | 182 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once | 
|  | 183 | * for the whole region. | 
|  | 184 | */ | 
|  | 185 | static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, | 
|  | 186 | int page_start, int page_end) | 
|  | 187 | { | 
|  | 188 | flush_tlb_kernel_range( | 
|  | 189 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), | 
|  | 190 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); | 
|  | 191 | } | 
|  | 192 |  | 
|  | 193 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, | 
|  | 194 | int nr_pages) | 
|  | 195 | { | 
|  | 196 | return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, | 
|  | 197 | PAGE_KERNEL, pages); | 
|  | 198 | } | 
|  | 199 |  | 
|  | 200 | /** | 
|  | 201 | * pcpu_map_pages - map pages into a pcpu_chunk | 
|  | 202 | * @chunk: chunk of interest | 
|  | 203 | * @pages: pages array containing pages to be mapped | 
|  | 204 | * @page_start: page index of the first page to map | 
|  | 205 | * @page_end: page index of the last page to map + 1 | 
|  | 206 | * | 
|  | 207 | * For each cpu, map pages [@page_start,@page_end) into @chunk.  The | 
|  | 208 | * caller is responsible for calling pcpu_post_map_flush() after all | 
|  | 209 | * mappings are complete. | 
|  | 210 | * | 
|  | 211 | * This function is responsible for setting up whatever is necessary for | 
|  | 212 | * reverse lookup (addr -> chunk). | 
|  | 213 | */ | 
|  | 214 | static int pcpu_map_pages(struct pcpu_chunk *chunk, | 
|  | 215 | struct page **pages, int page_start, int page_end) | 
|  | 216 | { | 
|  | 217 | unsigned int cpu, tcpu; | 
|  | 218 | int i, err; | 
|  | 219 |  | 
|  | 220 | for_each_possible_cpu(cpu) { | 
|  | 221 | err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), | 
|  | 222 | &pages[pcpu_page_idx(cpu, page_start)], | 
|  | 223 | page_end - page_start); | 
|  | 224 | if (err < 0) | 
|  | 225 | goto err; | 
|  | 226 |  | 
|  | 227 | for (i = page_start; i < page_end; i++) | 
|  | 228 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], | 
|  | 229 | chunk); | 
|  | 230 | } | 
|  | 231 | return 0; | 
|  | 232 | err: | 
|  | 233 | for_each_possible_cpu(tcpu) { | 
|  | 234 | if (tcpu == cpu) | 
|  | 235 | break; | 
|  | 236 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), | 
|  | 237 | page_end - page_start); | 
|  | 238 | } | 
|  | 239 | pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); | 
|  | 240 | return err; | 
|  | 241 | } | 
|  | 242 |  | 
|  | 243 | /** | 
|  | 244 | * pcpu_post_map_flush - flush cache after mapping | 
|  | 245 | * @chunk: pcpu_chunk the regions to be flushed belong to | 
|  | 246 | * @page_start: page index of the first page to be flushed | 
|  | 247 | * @page_end: page index of the last page to be flushed + 1 | 
|  | 248 | * | 
|  | 249 | * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush | 
|  | 250 | * cache. | 
|  | 251 | * | 
|  | 252 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once | 
|  | 253 | * for the whole region. | 
|  | 254 | */ | 
|  | 255 | static void pcpu_post_map_flush(struct pcpu_chunk *chunk, | 
|  | 256 | int page_start, int page_end) | 
|  | 257 | { | 
|  | 258 | flush_cache_vmap( | 
|  | 259 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), | 
|  | 260 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); | 
|  | 261 | } | 
|  | 262 |  | 
|  | 263 | /** | 
|  | 264 | * pcpu_populate_chunk - populate and map an area of a pcpu_chunk | 
|  | 265 | * @chunk: chunk of interest | 
|  | 266 | * @page_start: the start page | 
|  | 267 | * @page_end: the end page | 
|  | 268 | * @gfp: allocation flags passed to the underlying memory allocator | 
|  | 269 | * | 
|  | 270 | * For each cpu, populate and map pages [@page_start,@page_end) into | 
|  | 271 | * @chunk. | 
|  | 272 | * | 
|  | 273 | * CONTEXT: | 
|  | 274 | * pcpu_alloc_mutex, does GFP_KERNEL allocation. | 
|  | 275 | */ | 
|  | 276 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, | 
|  | 277 | int page_start, int page_end, gfp_t gfp) | 
|  | 278 | { | 
|  | 279 | struct page **pages; | 
|  | 280 |  | 
|  | 281 | pages = pcpu_get_pages(); | 
|  | 282 | if (!pages) | 
|  | 283 | return -ENOMEM; | 
|  | 284 |  | 
|  | 285 | if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) | 
|  | 286 | return -ENOMEM; | 
|  | 287 |  | 
|  | 288 | if (pcpu_map_pages(chunk, pages, page_start, page_end)) { | 
|  | 289 | pcpu_free_pages(chunk, pages, page_start, page_end); | 
|  | 290 | return -ENOMEM; | 
|  | 291 | } | 
|  | 292 | pcpu_post_map_flush(chunk, page_start, page_end); | 
|  | 293 |  | 
|  | 294 | return 0; | 
|  | 295 | } | 
|  | 296 |  | 
/**
 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 * @chunk: chunk to depopulate
 * @page_start: the start page
 * @page_end: the end page
 *
 * For each cpu, unmap pages [@page_start,@page_end) from @chunk and
 * free them.
 *
 * CONTEXT:
 * pcpu_alloc_mutex.
 */
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
				  int page_start, int page_end)
{
	/*
	 * Getting here implies at least one earlier population attempt
	 * succeeded, so the temp pages array has to exist already.
	 */
	struct page **pages = pcpu_get_pages();

	BUG_ON(!pages);

	/* cache flush first, then unmap */
	pcpu_pre_unmap_flush(chunk, page_start, page_end);
	pcpu_unmap_pages(chunk, pages, page_start, page_end);

	/* no TLB flush here, vmalloc will handle it lazily */

	/* release the pages that pcpu_unmap_pages() recorded in @pages */
	pcpu_free_pages(chunk, pages, page_start, page_end);
}
|  | 331 |  | 
|  | 332 | static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) | 
|  | 333 | { | 
|  | 334 | struct pcpu_chunk *chunk; | 
|  | 335 | struct vm_struct **vms; | 
|  | 336 |  | 
|  | 337 | chunk = pcpu_alloc_chunk(gfp); | 
|  | 338 | if (!chunk) | 
|  | 339 | return NULL; | 
|  | 340 |  | 
|  | 341 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, | 
|  | 342 | pcpu_nr_groups, pcpu_atom_size); | 
|  | 343 | if (!vms) { | 
|  | 344 | pcpu_free_chunk(chunk); | 
|  | 345 | return NULL; | 
|  | 346 | } | 
|  | 347 |  | 
|  | 348 | chunk->data = vms; | 
|  | 349 | chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; | 
|  | 350 |  | 
|  | 351 | pcpu_stats_chunk_alloc(); | 
|  | 352 | trace_percpu_create_chunk(chunk->base_addr); | 
|  | 353 |  | 
|  | 354 | return chunk; | 
|  | 355 | } | 
|  | 356 |  | 
|  | 357 | static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) | 
|  | 358 | { | 
|  | 359 | if (!chunk) | 
|  | 360 | return; | 
|  | 361 |  | 
|  | 362 | pcpu_stats_chunk_dealloc(); | 
|  | 363 | trace_percpu_destroy_chunk(chunk->base_addr); | 
|  | 364 |  | 
|  | 365 | if (chunk->data) | 
|  | 366 | pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); | 
|  | 367 | pcpu_free_chunk(chunk); | 
|  | 368 | } | 
|  | 369 |  | 
/* Resolve @addr to its struct page through vmalloc_to_page(). */
static struct page *pcpu_addr_to_page(void *addr)
{
	struct page *pg = vmalloc_to_page(addr);

	return pg;
}
|  | 374 |  | 
|  | 375 | static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) | 
|  | 376 | { | 
|  | 377 | /* no extra restriction */ | 
|  | 378 | return 0; | 
|  | 379 | } |