Merge: s390/pci: Don't abort recovery for user-space drivers

MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/7267

JIRA: https://issues.redhat.com/browse/RHEL-110235

commit 62355f1f87b8c7f8785a8dd3cd5ca6e5b513566a
Author: Niklas Schnelle <schnelle@linux.ibm.com>
Date:   Wed Jun 25 11:28:30 2025 +0200

    s390/pci: Allow automatic recovery with minimal driver support

    According to Documentation/PCI/pci-error-recovery.rst only the
    error_detected() callback in the err_handler struct is mandatory for
    a driver to support error recovery. So far s390's error recovery chose
    a stricter approach also requiring slot_reset() and resume().

    Relax this requirement and only require error_detected(). If a callback
    is not implemented, EEH and AER treat this as PCI_ERS_RESULT_NONE. This
    return value is otherwise used by drivers abstaining from their vote
    on how to proceed with recovery and is currently also not supported by
    s390's recovery code.

    So to support missing callbacks in line with other implementors of the
    recovery flow, also handle PCI_ERS_RESULT_NONE. Since s390 only does
    per-PCI-function recovery and does not do voting, treat
    PCI_ERS_RESULT_NONE optimistically and proceed through recovery unless
    other failures prevent this.

    Reviewed-by: Farhan Ali <alifm@linux.ibm.com>
    Reviewed-by: Julian Ruess <julianr@linux.ibm.com>
    Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
    Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>

Signed-off-by: Mete Durlu <mdurlu@redhat.com>

Approved-by: Steve Best <sbest@redhat.com>
Approved-by: Tony Camuso <tcamuso@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>

Merged-by: Patrick Talbert <ptalbert@redhat.com>
This commit is contained in:
Patrick Talbert 2025-09-06 10:41:44 -04:00
commit df627227d8
1 changed files with 29 additions and 15 deletions

View File

@ -54,6 +54,7 @@ static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res)
case PCI_ERS_RESULT_CAN_RECOVER: case PCI_ERS_RESULT_CAN_RECOVER:
case PCI_ERS_RESULT_RECOVERED: case PCI_ERS_RESULT_RECOVERED:
case PCI_ERS_RESULT_NEED_RESET: case PCI_ERS_RESULT_NEED_RESET:
case PCI_ERS_RESULT_NONE:
return false; return false;
default: default:
return true; return true;
@ -78,10 +79,6 @@ static bool is_driver_supported(struct pci_driver *driver)
return false; return false;
if (!driver->err_handler->error_detected) if (!driver->err_handler->error_detected)
return false; return false;
if (!driver->err_handler->slot_reset)
return false;
if (!driver->err_handler->resume)
return false;
return true; return true;
} }
@ -118,16 +115,18 @@ static pci_ers_result_t zpci_event_do_error_state_clear(struct pci_dev *pdev,
return PCI_ERS_RESULT_NEED_RESET; return PCI_ERS_RESULT_NEED_RESET;
} }
if (driver->err_handler->mmio_enabled) { if (driver->err_handler->mmio_enabled)
ers_res = driver->err_handler->mmio_enabled(pdev); ers_res = driver->err_handler->mmio_enabled(pdev);
if (ers_result_indicates_abort(ers_res)) { else
pr_info("%s: Automatic recovery failed after MMIO re-enable\n", ers_res = PCI_ERS_RESULT_NONE;
pci_name(pdev));
return ers_res; if (ers_result_indicates_abort(ers_res)) {
} else if (ers_res == PCI_ERS_RESULT_NEED_RESET) { pr_info("%s: Automatic recovery failed after MMIO re-enable\n",
pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev)); pci_name(pdev));
return ers_res; return ers_res;
} } else if (ers_res == PCI_ERS_RESULT_NEED_RESET) {
pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev));
return ers_res;
} }
pr_debug("%s: Unblocking DMA\n", pci_name(pdev)); pr_debug("%s: Unblocking DMA\n", pci_name(pdev));
@ -154,7 +153,12 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
return ers_res; return ers_res;
} }
pdev->error_state = pci_channel_io_normal; pdev->error_state = pci_channel_io_normal;
ers_res = driver->err_handler->slot_reset(pdev);
if (driver->err_handler->slot_reset)
ers_res = driver->err_handler->slot_reset(pdev);
else
ers_res = PCI_ERS_RESULT_NONE;
if (ers_result_indicates_abort(ers_res)) { if (ers_result_indicates_abort(ers_res)) {
pr_info("%s: Automatic recovery failed after slot reset\n", pci_name(pdev)); pr_info("%s: Automatic recovery failed after slot reset\n", pci_name(pdev));
return ers_res; return ers_res;
@ -218,7 +222,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
goto out_unlock; goto out_unlock;
} }
if (ers_res == PCI_ERS_RESULT_CAN_RECOVER) { if (ers_res != PCI_ERS_RESULT_NEED_RESET) {
ers_res = zpci_event_do_error_state_clear(pdev, driver); ers_res = zpci_event_do_error_state_clear(pdev, driver);
if (ers_result_indicates_abort(ers_res)) { if (ers_result_indicates_abort(ers_res)) {
status_str = "failed (abort on MMIO enable)"; status_str = "failed (abort on MMIO enable)";
@ -229,6 +233,16 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
if (ers_res == PCI_ERS_RESULT_NEED_RESET) if (ers_res == PCI_ERS_RESULT_NEED_RESET)
ers_res = zpci_event_do_reset(pdev, driver); ers_res = zpci_event_do_reset(pdev, driver);
/*
* ers_res can be PCI_ERS_RESULT_NONE either because the driver
* decided to return it, indicating that it abstains from voting
* on how to recover, or because it didn't implement the callback.
* In both cases we assume that, if there is nothing else causing a
* disconnect, we recovered successfully.
*/
if (ers_res == PCI_ERS_RESULT_NONE)
ers_res = PCI_ERS_RESULT_RECOVERED;
if (ers_res != PCI_ERS_RESULT_RECOVERED) { if (ers_res != PCI_ERS_RESULT_RECOVERED) {
pr_err("%s: Automatic recovery failed; operator intervention is required\n", pr_err("%s: Automatic recovery failed; operator intervention is required\n",
pci_name(pdev)); pci_name(pdev));