#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only

# Check whether the EEH Partitionable Endpoint (PE) of a PCI device is
# healthy.
#
# Arguments: $1 - sysfs PCI device name (DDDD:BB:DD.F)
# Returns:   0 if the PE state could be read and reports a working PE,
#            1 if the state is unreadable, malformed, or reports an
#            error/recovery in progress
pe_ok() {
	local dev="$1"
	local path="/sys/bus/pci/devices/$dev/eeh_pe_state"
	# Initialise explicitly: some shells let "local" inherit the value
	# of a same-named variable from the calling scope.
	local fw_state="" sw_state="" rest=""

	# if a driver doesn't support the error handling callbacks then the
	# device is recovered by removing and re-probing it. This causes the
	# sysfs directory to disappear so read the PE state once and squash
	# any potential error messages. The file is expected to hold two
	# whitespace separated values: the firmware state and the software
	# state.
	{ read -r fw_state sw_state rest <"$path"; } 2>/dev/null || :

	# Without both fields the state cannot be judged, so report not ok.
	if [ -z "$fw_state" ] || [ -z "$sw_state" ]; then
		return 1
	fi

	# If EEH_PE_ISOLATED or EEH_PE_RECOVERING are set then the PE is in an
	# error state or being recovered. Either way, not ok.
	if [ "$((sw_state & 0x3))" -ne 0 ]; then
		return 1
	fi

	# A functioning PE should have the EEH_STATE_MMIO_ACTIVE and
	# EEH_STATE_DMA_ACTIVE flags set. The platform backends also set
	# these while the PE is in reset, but the RECOVERING check above
	# should stop any false positives.
	if [ "$((fw_state & 0x18))" -ne "$((0x18))" ]; then
		return 1
	fi

	return 0
}

# Report whether EEH is available and enabled on this system.
#
# Returns: 0 only when /proc/powerpc/eeh exists and contains the text
#          'EEH Subsystem is enabled'; non-zero otherwise.
eeh_supported() {
	# The existence test comes first so that grep is never run on (and
	# never complains about) a missing file.
	test -e /proc/powerpc/eeh && \
		grep -q 'EEH Subsystem is enabled' /proc/powerpc/eeh
}

# Inject an EEH error into a single PCI device and wait for it to recover.
#
# Arguments:   $1 - sysfs PCI device name (DDDD:BB:DD.F)
# Environment: EEH_MAX_WAIT - seconds to wait for recovery (default 60)
# Outputs:     progress and result messages on stdout
# Returns:     0 if the device recovered within the timeout, 1 otherwise
eeh_one_dev() {
	local dev="$1"
	local max_wait i

	# Using this function from the command line is sometimes useful for
	# testing so check that the argument is a well-formed sysfs device
	# name. An empty name, "." / ".." or anything containing a slash
	# would resolve to some other existing directory and slip past the
	# existence test, so reject those up front.
	case "$dev" in
	'' | . | .. | */*)
		echo "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
		return 1
		;;
	esac
	if ! test -e "/sys/bus/pci/devices/$dev/" ; then
		echo "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
		return 1
	fi

	# Break it. If the injection write itself fails there is nothing to
	# recover from, and the wait loop below would report a bogus success.
	if ! echo "$dev" >/sys/kernel/debug/powerpc/eeh_dev_break ; then
		echo "$dev, Failed to inject EEH error"
		return 1
	fi

	# Force an EEH device check. If the kernel has already
	# noticed the EEH (due to a driver poll or whatever), this
	# is a no-op. The result is deliberately not checked.
	echo "$dev" >/sys/kernel/debug/powerpc/eeh_dev_check

	# Default to a 60s timeout when waiting for a device to recover. This
	# is an arbitrary default which can be overridden by setting the
	# EEH_MAX_WAIT environmental variable when required.

	# The current record holder for longest recovery time is:
	#  "Adaptec Series 8 12G SAS/PCIe 3" at 39 seconds
	max_wait=${EEH_MAX_WAIT:=60}

	# Poll once per second; $i counts the seconds waited so far.
	i=0
	while [ "$i" -le "$max_wait" ] ; do
		if pe_ok "$dev" ; then
			break
		fi
		echo "$dev, waited $i/${max_wait}"
		sleep 1
		i=$((i + 1))
	done

	# Check once more so that a timeout and a recovery are told apart.
	if ! pe_ok "$dev" ; then
		echo "$dev, Failed to recover!"
		return 1
	fi

	echo "$dev, Recovered after $i seconds"
	return 0
}
