// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * SN Platform GRU Driver
 *
 *		MMUOPS callbacks + TLB flushing
 *
 * This file handles mmu notifier callbacks from the core kernel. The callbacks
 * are used to update the TLB in the GRU as a result of changes in the
 * state of a process address space. This file also handles TLB invalidates
 * from the GRU driver.
 *
 * Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
 */

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/device.h>
#include <linux/hugetlb.h>
#include <linux/delay.h>
#include <linux/timex.h>
#include <linux/srcu.h>
#include <asm/processor.h>
#include "gru.h"
#include "grutables.h"
#include <asm/uv/uv_hub.h>

#define gru_random()	get_cycles()

/* ---------------------------------- TLB Invalidation functions --------
 * get_tgh_handle
 *
 * Find a TGH to use for issuing a TLB invalidate. For GRUs that are on the
 * local blade, use a fixed TGH that is a function of the blade-local cpu
 * number. Normally, this TGH is private to the cpu & no contention occurs for
 * the TGH. For off-blade GRUs, select a random TGH in the range above the
 * private TGHs. A spinlock is required to access this TGH & the lock must be
 * released when the invalidate completes. This sucks, but it is the best we
 * can do.
 *
 * Note that the spinlock is IN the TGH handle so locking does not involve
 * additional cache lines.
 */
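/* Off-blade case: pick a random TGH from the range above the cpu-private TGHs. */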
static inline int get_off_blade_tgh(struct gru_state *gru)
{
	int n;

	n = GRU_NUM_TGH - gru->gs_tgh_first_remote;
	n = gru_random() % n;
	n += gru->gs_tgh_first_remote;
	return n;
}

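/* Blade-local case: each cpu (or small group of cpus) maps to its own private TGH. */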
static inline int get_on_blade_tgh(struct gru_state *gru)
{
	return uv_blade_processor_id() >> gru->gs_tgh_local_shift;
}

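/*
 * Select a TGH appropriate for the calling cpu & target GRU and acquire
 * its lock. Must be paired with get_unlock_tgh_handle().
 */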
static struct gru_tlb_global_handle *get_lock_tgh_handle(struct gru_state
							  *gru)
{
	struct gru_tlb_global_handle *tgh;
	int n;

	if (uv_numa_blade_id() == gru->gs_blade_id)
		n = get_on_blade_tgh(gru);
	else
		n = get_off_blade_tgh(gru);
	tgh = get_tgh_by_index(gru, n);
	lock_tgh_handle(tgh);

	return tgh;
}

static void get_unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
{
	unlock_tgh_handle(tgh);
}

/*
 * gru_flush_tlb_range
 *
 * General purpose TLB invalidation function. This function scans every GRU in
 * the ENTIRE system (partition) looking for GRUs where the specified MM has
 * been accessed by the GRU. For each GRU found, the TLB must be invalidated OR
 * the ASID invalidated. Invalidating an ASID causes a new ASID to be assigned
 * on the next fault. This effectively flushes the ENTIRE TLB for the MM at the
 * cost of (possibly) a large number of future TLB misses.
 *
 * The current algorithm is optimized based on the following (somewhat true)
 * assumptions:
 *	- GRU contexts are not loaded into a GRU unless a reference is made to
 *	  the data segment or control block (this is true, not an assumption).
 *	  If a DS/CB is referenced, the user will also issue instructions that
 *	  cause TLB misses. It is not necessary to optimize for the case where
 *	  contexts are loaded but no instructions cause TLB misses. (I know
 *	  this will happen but I'm not optimizing for it.)
 *	- GRU instructions to invalidate TLB entries are SLOOOOWWW - normally
 *	  a few usec but in unusual cases, it could be longer. Avoid if
 *	  possible.
 *	- intrablade process migration between cpus is infrequent but does
 *	  occur.
 *	- a GRU context is not typically migrated to a different GRU on the
 *	  blade because of intrablade migration.
 *	- interblade migration is rare. Processes migrate their GRU context to
 *	  the new blade.
 *	- if interblade migration occurs, migration back to the original blade
 *	  is very very rare (i.e., no optimization for this case).
 *	- most GRU instructions operate on a subset of the user REGIONS. Code
 *	  & shared library regions are not likely targets of GRU instructions.
 *
 * To help improve the efficiency of TLB invalidation, the GMS data
 * structure is maintained for EACH address space (MM struct). The GMS is
 * also the structure that contains the pointer to the mmu notifier callout
 * functions. This structure is linked to the mm_struct for the address space
 * using the mmu notifier "register" function. The mmu notifier interfaces are
 * used to provide the callbacks for TLB invalidation. The GMS contains:
 *
 *	- asid[maxgrus] array. ASIDs are assigned to a GRU when a context is
 *	  loaded into the GRU.
 *	- asidmap[maxgrus]. Bitmap to make it easier to find non-zero ASIDs in
 *	  the above array.
 *	- ctxbitmap[maxgrus]. Indicates the contexts that are currently active
 *	  in the GRU for the address space. This bitmap must be passed to the
 *	  GRU to do an invalidate.
 *
 * The current algorithm for invalidating TLBs is:
 *	- scan the asidmap for GRUs where the context has been loaded, i.e.,
 *	  the ASID is non-zero.
 *	- for each GRU found:
 *		- if the ctxbitmap is non-zero, there are active contexts in
 *		  the GRU. TLB invalidate instructions must be issued to the
 *		  GRU.
 *		- if the ctxbitmap is zero, no context is active. Set the ASID
 *		  to zero to force a full TLB invalidation. This is fast but
 *		  will cause a lot of TLB misses if the context is reloaded
 *		  onto the GRU.
 */

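/*
 * Example caller: the mmu notifier callout gru_invalidate_range_start()
 * below flushes the range being invalidated with
 *
 *	gru_flush_tlb_range(gms, range->start, range->end - range->start);
 */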
void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
			 unsigned long len)
{
	struct gru_state *gru;
	struct gru_mm_tracker *asids;
	struct gru_tlb_global_handle *tgh;
	unsigned long num;
	int grupagesize, pagesize, pageshift, gid, asid;

	/* ZZZ TODO - handle huge pages */
	pageshift = PAGE_SHIFT;
	pagesize = (1UL << pageshift);
	grupagesize = GRU_PAGESIZE(pageshift);
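	/* number of pages to invalidate, rounded up & capped at GRUMAXINVAL */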
	num = min(((len + pagesize - 1) >> pageshift), GRUMAXINVAL);

	STAT(flush_tlb);
	gru_dbg(grudev, "gms %p, start 0x%lx, len 0x%lx, asidmap 0x%lx\n", gms,
		start, len, gms->ms_asidmap[0]);

	spin_lock(&gms->ms_asid_lock);
	for_each_gru_in_bitmap(gid, gms->ms_asidmap) {
		STAT(flush_tlb_gru);
		gru = GID_TO_GRU(gid);
		asids = gms->ms_asids + gid;
		asid = asids->mt_asid;
		if (asids->mt_ctxbitmap && asid) {
			STAT(flush_tlb_gru_tgh);
			asid = GRUASID(asid, start);
			gru_dbg(grudev,
				"  FLUSH gruid %d, asid 0x%x, vaddr 0x%lx, vamask 0x%x, num %ld, cbmap 0x%x\n",
				gid, asid, start, grupagesize, num, asids->mt_ctxbitmap);
			tgh = get_lock_tgh_handle(gru);
			tgh_invalidate(tgh, start, ~0, asid, grupagesize, 0,
				       num - 1, asids->mt_ctxbitmap);
			get_unlock_tgh_handle(tgh);
		} else {
			STAT(flush_tlb_gru_zero_asid);
			asids->mt_asid = 0;
			__clear_bit(gru->gs_gid, gms->ms_asidmap);
			gru_dbg(grudev,
				"  CLEARASID gruid %d, asid 0x%x, cbtmap 0x%x, asidmap 0x%lx\n",
				gid, asid, asids->mt_ctxbitmap,
				gms->ms_asidmap[0]);
		}
	}
	spin_unlock(&gms->ms_asid_lock);
}

/*
 * Flush the entire TLB on a chiplet.
 */
void gru_flush_all_tlb(struct gru_state *gru)
{
	struct gru_tlb_global_handle *tgh;

	gru_dbg(grudev, "gid %d\n", gru->gs_gid);
	tgh = get_lock_tgh_handle(gru);
	tgh_invalidate(tgh, 0, ~0, 0, 1, 1, GRUMAXINVAL - 1, 0xffff);
	get_unlock_tgh_handle(tgh);
}

/*
 * MMUOPS notifier callout functions
 */
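/*
 * Called by the core mm before a range of the address space is invalidated.
 * Mark an invalidate as in progress (ms_range_active) and flush the range
 * from every GRU that has referenced this address space.
 */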
static int gru_invalidate_range_start(struct mmu_notifier *mn,
			const struct mmu_notifier_range *range)
{
	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
						 ms_notifier);

	STAT(mmu_invalidate_range);
	atomic_inc(&gms->ms_range_active);
	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
		range->start, range->end, atomic_read(&gms->ms_range_active));
	gru_flush_tlb_range(gms, range->start, range->end - range->start);

	return 0;
}

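/*
 * Called by the core mm when the range invalidation has finished. Drop
 * ms_range_active and wake any thread waiting for outstanding invalidates.
 */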
static void gru_invalidate_range_end(struct mmu_notifier *mn,
			const struct mmu_notifier_range *range)
{
	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
						 ms_notifier);

	/* ..._and_test() provides needed barrier */
	(void)atomic_dec_and_test(&gms->ms_range_active);

	wake_up_all(&gms->ms_wait_queue);
	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx\n",
		gms, range->start, range->end);
}

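/*
 * Called by the mmu notifier "get" interface when no GRU notifier is
 * registered for the mm yet: allocate and initialize a new GMS.
 */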
static struct mmu_notifier *gru_alloc_notifier(struct mm_struct *mm)
{
	struct gru_mm_struct *gms;

	gms = kzalloc(sizeof(*gms), GFP_KERNEL);
	if (!gms)
		return ERR_PTR(-ENOMEM);
	STAT(gms_alloc);
	spin_lock_init(&gms->ms_asid_lock);
	init_waitqueue_head(&gms->ms_wait_queue);

	return &gms->ms_notifier;
}

static void gru_free_notifier(struct mmu_notifier *mn)
{
	kfree(container_of(mn, struct gru_mm_struct, ms_notifier));
	STAT(gms_free);
}

static const struct mmu_notifier_ops gru_mmuops = {
	.invalidate_range_start	= gru_invalidate_range_start,
	.invalidate_range_end	= gru_invalidate_range_end,
	.alloc_notifier		= gru_alloc_notifier,
	.free_notifier		= gru_free_notifier,
};

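/*
 * Find (or create) the GMS for the current mm. Takes a reference on the
 * notifier; release it with gru_drop_mmu_notifier().
 */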
struct gru_mm_struct *gru_register_mmu_notifier(void)
{
	struct mmu_notifier *mn;

	mn = mmu_notifier_get_locked(&gru_mmuops, current->mm);
	if (IS_ERR(mn))
		return ERR_CAST(mn);

	return container_of(mn, struct gru_mm_struct, ms_notifier);
}

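/* Drop the reference taken by gru_register_mmu_notifier(). */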
void gru_drop_mmu_notifier(struct gru_mm_struct *gms)
{
	mmu_notifier_put(&gms->ms_notifier);
}

/*
 * Setup TGH parameters. There are:
 *	- 24 TGH handles per GRU chiplet
 *	- a portion (MAX_LOCAL_TGH) of the handles are reserved for
 *	  use by blade-local cpus
 *	- the rest are used by off-blade cpus. This usage is
 *	  less frequent than blade-local usage.
 *
 * For now, use 16 handles for local flushes, 8 for remote flushes. If the
 * blade has 16 or fewer cpus, each cpu has a unique handle that it can use.
 */
#define MAX_LOCAL_TGH	16

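/*
 * Worked example (illustrative) of gru_tgh_flush_init() below: on a blade
 * with 16 possible cpus, gs_tgh_local_shift = 0 and gs_tgh_first_remote = 16,
 * so each local cpu owns TGH 0..15 and off-blade purges use TGH 16..23. On a
 * blade with 28 possible cpus, gs_tgh_local_shift = 1 and
 * gs_tgh_first_remote = 14, so pairs of cpus share TGH 0..13 and off-blade
 * purges use TGH 14..23.
 */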
void gru_tgh_flush_init(struct gru_state *gru)
{
	int cpus, shift = 0, n;

	cpus = uv_blade_nr_possible_cpus(gru->gs_blade_id);

	/* n = cpus rounded up to next power of 2 */
	if (cpus) {
		n = 1 << fls(cpus - 1);

		/*
		 * shift count for converting local cpu# to TGH index
		 *   0 if cpus <= MAX_LOCAL_TGH,
		 *   1 if cpus <= 2*MAX_LOCAL_TGH,
		 *   etc
		 */
		shift = max(0, fls(n - 1) - fls(MAX_LOCAL_TGH - 1));
	}
	gru->gs_tgh_local_shift = shift;

	/* first starting TGH index to use for remote purges */
	gru->gs_tgh_first_remote = (cpus + (1 << shift) - 1) >> shift;
}