diff options
author | Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> | 2012-07-20 09:55:43 +0000 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-07-25 15:24:13 -0700 |
commit | 57dbf29a54bda5773f9ed1d00e3cc633294259da (patch) | |
tree | 311f1973354a10fa997db812299982b560dc821f /drivers/net/ethernet/mellanox/mlx4/main.c | |
parent | f94898ea6682977f15c5a8f9ffb293a14f95455a (diff) | |
download | blackbird-obmc-linux-57dbf29a54bda5773f9ed1d00e3cc633294259da.tar.gz blackbird-obmc-linux-57dbf29a54bda5773f9ed1d00e3cc633294259da.zip |
mlx4: Add support for EEH error recovery
Currently the mlx4 drivers don't have the necessary callbacks to
implement EEH error detection and recovery, so the PCI layer uses the
probe and remove callbacks to try to recover the device after an error on
the bus. However, these callbacks have race conditions with the internal
catastrophic error recovery functions, which will also detect the error
and this can cause the system to crash if both EEH and catas functions
try to reset the device.
This patch adds the necessary error recovery callbacks and makes sure
that the internal catastrophic error functions will not try to reset the
device in such scenarios. It also adds some calls to
pci_channel_offline() to suppress reads/writes on the bus when the slot
cannot accept I/O operations so we prevent unnecessary accesses to the
bus and speed up the device removal.
Signed-off-by: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com>
Acked-by: Shlomo Pongratz <shlomop@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx4/main.c')
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx4/main.c | 30 |
1 files changed, 29 insertions, 1 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 42645166bae2..e717091734d0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -1775,6 +1775,9 @@ static int mlx4_get_ownership(struct mlx4_dev *dev)
 	void __iomem *owner;
 	u32 ret;
 
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
 	owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
 			MLX4_OWNER_SIZE);
 	if (!owner) {
@@ -1791,6 +1794,9 @@ static void mlx4_free_ownership(struct mlx4_dev *dev)
 {
 	void __iomem *owner;
 
+	if (pci_channel_offline(dev->pdev))
+		return;
+
 	owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
 			MLX4_OWNER_SIZE);
 	if (!owner) {
@@ -2237,11 +2243,33 @@ static DEFINE_PCI_DEVICE_TABLE(mlx4_pci_table) = {
 
 MODULE_DEVICE_TABLE(pci, mlx4_pci_table);
 
+static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev,
+					      pci_channel_state_t state)
+{
+	mlx4_remove_one(pdev);
+
+	return state == pci_channel_io_perm_failure ?
+		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev)
+{
+	int ret = __mlx4_init_one(pdev, NULL);
+
+	return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
+}
+
+static struct pci_error_handlers mlx4_err_handler = {
+	.error_detected = mlx4_pci_err_detected,
+	.slot_reset = mlx4_pci_slot_reset,
+};
+
 static struct pci_driver mlx4_driver = {
 	.name = DRV_NAME,
 	.id_table = mlx4_pci_table,
 	.probe = mlx4_init_one,
-	.remove = __devexit_p(mlx4_remove_one)
+	.remove = __devexit_p(mlx4_remove_one),
+	.err_handler = &mlx4_err_handler,
 };
 
 static int __init mlx4_verify_params(void)