// SPDX-License-Identifier: GPL-2.0-only
/*
 * /dev/mcelog driver
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/poll.h>

#include "internal.h"

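/*
 * Error injectors such as the mce-inject module register on this chain;
 * records written to /dev/mcelog are passed down it so an injector can
 * turn them into simulated machine checks.
 */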
static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);

static DEFINE_MUTEX(mce_chrdev_read_mutex);

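/*
 * Path of the usermode helper that is run when a machine check comes in.
 * It is configured through the "trigger" device attribute below and is
 * empty (i.e. disabled) by default.
 */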
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

/*
 * MCE logging infrastructure for /dev/mcelog.
 * Keeping the log in its own buffer avoids deadlocks on printk locks and
 * keeps MCEs separate from kernel messages, which avoids bogus bug reports.
 */

static struct mce_log_buffer mcelog = {
	.signature = MCE_LOG_SIGNATURE,
	.len = MCE_LOG_LEN,
	.recordlen = sizeof(struct mce),
};

static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

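/*
 * Notifier callback: append one decoded MCE record to the legacy mcelog
 * buffer and wake up any readers sleeping in poll()/read().
 */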
static int dev_mce_log(struct notifier_block *nb, unsigned long val,
			void *data)
{
	struct mce *mce = (struct mce *)data;
	unsigned int entry;

	mutex_lock(&mce_chrdev_read_mutex);

	entry = mcelog.next;

	/*
	 * When the buffer fills up discard new entries. Assume that the
	 * earlier errors are the more interesting ones:
	 */
	if (entry >= MCE_LOG_LEN) {
		set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
		goto unlock;
	}

	mcelog.next = entry + 1;

	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	mcelog.entry[entry].finished = 1;

	/* wake processes polling /dev/mcelog */
	wake_up_interruptible(&mce_chrdev_wait);

unlock:
	mutex_unlock(&mce_chrdev_read_mutex);

	return NOTIFY_OK;
}

static struct notifier_block dev_mcelog_nb = {
	.notifier_call = dev_mce_log,
	.priority = MCE_PRIO_MCELOG,
};

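/*
 * call_usermodehelper() cannot be invoked directly from the machine check
 * notification path, so running the trigger is deferred to a workqueue.
 */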
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

void mce_work_trigger(void)
{
	if (mce_helper[0])
		schedule_work(&mce_trigger_work);
}

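/*
 * sysfs attribute ("trigger") through which the helper path is read and
 * written. A trailing newline from userspace is stripped on write.
 */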
static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;

	strscpy(mce_helper, buf, sizeof(mce_helper));
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);

/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;	/* #times opened */
static int mce_chrdev_open_exclu;	/* already open exclusive? */

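/*
 * Opening with O_EXCL gives the caller exclusive access: further opens
 * fail with -EBUSY until the exclusive owner closes the device.
 */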
static int mce_chrdev_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	if (mce_chrdev_open_exclu ||
	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_chrdev_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		mce_chrdev_open_exclu = 1;
	mce_chrdev_open_count++;

	spin_unlock(&mce_chrdev_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_chrdev_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	mce_chrdev_open_count--;
	mce_chrdev_open_exclu = 0;

	spin_unlock(&mce_chrdev_state_lock);

	return 0;
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		/*
		 * When ERST is disabled, mce_chrdev_read() should return
		 * "no record" instead of "no device."
		 */
		if (rc == -ENODEV)
			return 0;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * Ideally the record would be cleared only after it has been
	 * flushed to disk or sent over the network by /sbin/mcelog, but
	 * there is no interface for that yet, so clear it here to avoid
	 * handing out duplicates.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	*ubuf += sizeof(struct mce);

	return 0;
}

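/*
 * Read the MCE log: first drain any records saved across reboots via APEI
 * ERST, then hand out and clear the in-memory buffer. Userspace must read
 * the whole buffer in one call.
 */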
static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
			       size_t usize, loff_t *off)
{
	char __user *buf = ubuf;
	unsigned next;
	int i, err;

	mutex_lock(&mce_chrdev_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	next = mcelog.next;
	err = 0;

	for (i = 0; i < next; i++) {
		struct mce *m = &mcelog.entry[i];

		err |= copy_to_user(buf, m, sizeof(*m));
		buf += sizeof(*m);
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_chrdev_read_mutex);

	return err ? err : buf - ubuf;
}

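/*
 * A reader becomes runnable as soon as either the in-memory buffer or the
 * APEI/ERST backlog has records pending.
 */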
static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_chrdev_wait, wait);
	if (READ_ONCE(mcelog.next))
		return EPOLLIN | EPOLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return EPOLLIN | EPOLLRDNORM;
	return 0;
}

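/*
 * ioctl interface used by the mcelog daemon to size its buffers and to
 * fetch-and-clear the overflow flags. A typical userspace sequence,
 * sketched here for illustration only (fd, reclen, loglen and buf are
 * hypothetical names):
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	ioctl(fd, MCE_GET_RECORD_LEN, &reclen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *	read(fd, buf, loglen * reclen);		must cover the whole buffer
 */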
static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
			     unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

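/*
 * Registration hooks for error injectors. These are exported so that an
 * injector (e.g. the mce-inject module) can be built as a module.
 */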
void mce_register_injector_chain(struct notifier_block *nb)
{
	blocking_notifier_chain_register(&mce_injector_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_injector_chain);

void mce_unregister_injector_chain(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&mce_injector_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);

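/*
 * Error injection entry point: a struct mce written to /dev/mcelog is
 * copied in, sanity-checked and passed down the injector chain.
 */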
static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
				size_t usize, loff_t *off)
{
	struct mce m;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	/*
	 * There are some cases where real MSR reads could slip
	 * through.
	 */
	if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
		return -EIO;

	if ((unsigned long)usize > sizeof(struct mce))
		usize = sizeof(struct mce);
	if (copy_from_user(&m, ubuf, usize))
		return -EFAULT;

	if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
		return -EINVAL;

	/*
	 * Need to give user space some time to set everything up,
	 * so do it a jiffy or two later everywhere.
	 */
	schedule_timeout(2);

	blocking_notifier_call_chain(&mce_injector_chain, 0, &m);

	return usize;
}

static const struct file_operations mce_chrdev_ops = {
	.open = mce_chrdev_open,
	.release = mce_chrdev_release,
	.read = mce_chrdev_read,
	.write = mce_chrdev_write,
	.poll = mce_chrdev_poll,
	.unlocked_ioctl = mce_chrdev_ioctl,
	.llseek = no_llseek,
};

static struct miscdevice mce_chrdev_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

static __init int dev_mcelog_init_device(void)
{
	int err;

	/* register character device /dev/mcelog */
	err = misc_register(&mce_chrdev_device);
	if (err) {
		if (err == -EBUSY)
			/* Xen dom0 might have registered the device already. */
			pr_info("Unable to init device /dev/mcelog, already registered\n");
		else
			pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);

		return err;
	}

	mce_register_decode_chain(&dev_mcelog_nb);
	return 0;
}
device_initcall_sync(dev_mcelog_init_device);