diff options
author | Jack Morgenstein <jackm@dev.mellanox.co.il> | 2011-12-13 04:18:30 +0000 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2011-12-13 13:56:08 -0500 |
commit | ab9c17a009ee8eb8c667f22dc0be0709effceab9 (patch) | |
tree | b85acd5d8cf28a42ef5f7361ef43b4195b89912d /drivers/net/ethernet/mellanox/mlx4/main.c | |
parent | d81c7186aa16a0da9e39961af6bad0c855a5d684 (diff) | |
download | blackbird-op-linux-ab9c17a009ee8eb8c667f22dc0be0709effceab9.tar.gz blackbird-op-linux-ab9c17a009ee8eb8c667f22dc0be0709effceab9.zip |
mlx4_core: Modify driver initialization flow to accommodate SRIOV for Ethernet
1. Added module parameters sr_iov and probe_vf for controlling enablement of
SRIOV mode.
2. Increased default max num-qps, num-mpts and log_num_macs to accomodate
SRIOV mode
3. Added port_type_array as a module parameter to allow driver startup with
ports configured as desired.
In SRIOV mode, only ETH is supported, and this array is ignored; otherwise,
for the case where the FW supports both port types (ETH and IB), the
port_type_array parameter is used.
By default, the port_type_array is set to configure both ports as IB.
4. When running in sriov mode, the master needs to initialize the ICM eq table
to hold the eq's for itself and also for all the slaves.
5. mlx4_set_port_mask() now invoked from mlx4_init_hca, instead of in mlx4_dev_cap.
6. Introduced sriov VF (slave) device startup/teardown logic (mainly procedures
mlx4_init_slave, mlx4_slave_exit, mlx4_slave_cap, mlx4_slave_exit and flow
modifications in __mlx4_init_one, mlx4_init_hca, and mlx4_setup_hca).
VFs obtain their startup information from the PF (master) device via the
comm channel.
7. In SRIOV mode (both PF and VF), MSI_X must be enabled, or the driver
aborts loading the device.
8. Do not allow setting port type via sysfs when running in SRIOV mode.
9. mlx4_get_ownership: Currently, only one PF is supported by the driver.
If the HCA is burned with FW which enables more than one PF, only one
of the PFs is allowed to run. The first one up grabs a FW ownership
semaphone -- all other PFs will find that semaphore taken, and the
driver will not allow them to run.
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Liran Liss <liranl@mellanox.co.il>
Signed-off-by: Marcel Apfelbaum <marcela@mellanox.co.il>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx4/main.c')
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx4/main.c | 842 |
1 files changed, 687 insertions, 155 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 19363b618295..b969bfb569e3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -40,6 +40,7 @@ #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/io-mapping.h> +#include <linux/delay.h> #include <linux/mlx4/device.h> #include <linux/mlx4/doorbell.h> @@ -75,6 +76,14 @@ MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); #endif /* CONFIG_PCI_MSI */ +static int num_vfs; +module_param(num_vfs, int, 0444); +MODULE_PARM_DESC(num_vfs, "enable #num_vfs functions if num_vfs > 0"); + +static int probe_vf; +module_param(probe_vf, int, 0644); +MODULE_PARM_DESC(probe_vf, "number of vfs to probe by pf driver (num_vfs > 0)"); + int mlx4_log_num_mgm_entry_size = 10; module_param_named(log_num_mgm_entry_size, mlx4_log_num_mgm_entry_size, int, 0444); @@ -83,21 +92,26 @@ MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num" " 10 gives 248.range: 9<=" " log_num_mgm_entry_size <= 12"); +#define MLX4_VF (1 << 0) + +#define HCA_GLOBAL_CAP_MASK 0 +#define PF_CONTEXT_BEHAVIOUR_MASK 0 + static char mlx4_version[] __devinitdata = DRV_NAME ": Mellanox ConnectX core driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; static struct mlx4_profile default_profile = { - .num_qp = 1 << 17, + .num_qp = 1 << 18, .num_srq = 1 << 16, .rdmarc_per_qp = 1 << 4, .num_cq = 1 << 16, .num_mcg = 1 << 13, - .num_mpt = 1 << 17, + .num_mpt = 1 << 19, .num_mtt = 1 << 20, }; -static int log_num_mac = 2; +static int log_num_mac = 7; module_param_named(log_num_mac, log_num_mac, int, 0444); MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)"); @@ -116,6 +130,23 @@ int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)"); +static int port_type_array[2] = {1, 1}; +static int arr_argc = 2; +module_param_array(port_type_array, int, &arr_argc, 0444); +MODULE_PARM_DESC(port_type_array, "Array of port types: IB by default"); + +struct mlx4_port_config { + struct list_head list; + enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; + struct pci_dev *pdev; +}; + +static inline int mlx4_master_get_num_eqs(struct mlx4_dev *dev) +{ + return dev->caps.reserved_eqs + + MLX4_MFUNC_EQ_NUM * (dev->num_slaves + 1); +} + int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type) { @@ -200,6 +231,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.trans_code[i] = dev_cap->trans_code[i]; } + dev->caps.uar_page_size = PAGE_SIZE; dev->caps.num_uars = dev_cap->uar_size / PAGE_SIZE; dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay; dev->caps.bf_reg_size = dev_cap->bf_reg_size; @@ -224,7 +256,9 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.reserved_eqs = dev_cap->reserved_eqs; dev->caps.reserved_mtts = dev_cap->reserved_mtts; dev->caps.reserved_mrws = dev_cap->reserved_mrws; - dev->caps.reserved_uars = dev_cap->reserved_uars; + + /* The first 128 UARs are used for EQ doorbells */ + dev->caps.reserved_uars = max_t(int, 128, dev_cap->reserved_uars); dev->caps.reserved_pds = dev_cap->reserved_pds; dev->caps.reserved_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? dev_cap->reserved_xrcds : 0; @@ -245,10 +279,36 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.log_num_prios = use_prio ? 3 : 0; for (i = 1; i <= dev->caps.num_ports; ++i) { - if (dev->caps.supported_type[i] != MLX4_PORT_TYPE_ETH) - dev->caps.port_type[i] = MLX4_PORT_TYPE_IB; - else - dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH; + dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE; + if (dev->caps.supported_type[i]) { + /* if only ETH is supported - assign ETH */ + if (dev->caps.supported_type[i] == MLX4_PORT_TYPE_ETH) + dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH; + /* if only IB is supported, + * assign IB only if SRIOV is off*/ + else if (dev->caps.supported_type[i] == + MLX4_PORT_TYPE_IB) { + if (dev->flags & MLX4_FLAG_SRIOV) + dev->caps.port_type[i] = + MLX4_PORT_TYPE_NONE; + else + dev->caps.port_type[i] = + MLX4_PORT_TYPE_IB; + /* if IB and ETH are supported, + * first of all check if SRIOV is on */ + } else if (dev->flags & MLX4_FLAG_SRIOV) + dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH; + /* if IB and ETH are supported and SRIOV is off + * use module parameters */ + else { + if (port_type_array[i-1]) + dev->caps.port_type[i] = + MLX4_PORT_TYPE_IB; + else + dev->caps.port_type[i] = + MLX4_PORT_TYPE_ETH; + } + } dev->caps.possible_type[i] = dev->caps.port_type[i]; mlx4_priv(dev)->sense.sense_allowed[i] = dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO; @@ -267,8 +327,6 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) } } - mlx4_set_port_mask(dev); - dev->caps.max_counters = 1 << ilog2(dev_cap->max_counters); dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps; @@ -287,6 +345,149 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) return 0; } +/*The function checks if there are live vf, return the num of them*/ +static int mlx4_how_many_lives_vf(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state; + int i; + int ret = 0; + + for (i = 1/*the ppf is 0*/; i < dev->num_slaves; ++i) { + s_state = &priv->mfunc.master.slave_state[i]; + if (s_state->active && s_state->last_cmd != + MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "%s: slave: %d is still active\n", + __func__, i); + ret++; + } + } + return ret; +} + +static int mlx4_is_slave_active(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave; + + if (!mlx4_is_master(dev)) + return 0; + + s_slave = &priv->mfunc.master.slave_state[slave]; + return !!s_slave->active; +} +EXPORT_SYMBOL(mlx4_is_slave_active); + +static int mlx4_slave_cap(struct mlx4_dev *dev) +{ + int err; + u32 page_size; + struct mlx4_dev_cap dev_cap; + struct mlx4_func_cap func_cap; + struct mlx4_init_hca_param hca_param; + int i; + + memset(&hca_param, 0, sizeof(hca_param)); + err = mlx4_QUERY_HCA(dev, &hca_param); + if (err) { + mlx4_err(dev, "QUERY_HCA command failed, aborting.\n"); + return err; + } + + /*fail if the hca has an unknown capability */ + if ((hca_param.global_caps | HCA_GLOBAL_CAP_MASK) != + HCA_GLOBAL_CAP_MASK) { + mlx4_err(dev, "Unknown hca global capabilities\n"); + return -ENOSYS; + } + + mlx4_log_num_mgm_entry_size = hca_param.log_mc_entry_sz; + + memset(&dev_cap, 0, sizeof(dev_cap)); + err = mlx4_dev_cap(dev, &dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); + return err; + } + + page_size = ~dev->caps.page_size_cap + 1; + mlx4_warn(dev, "HCA minimum page size:%d\n", page_size); + if (page_size > PAGE_SIZE) { + mlx4_err(dev, "HCA minimum page size of %d bigger than " + "kernel PAGE_SIZE of %ld, aborting.\n", + page_size, PAGE_SIZE); + return -ENODEV; + } + + /* slave gets uar page size from QUERY_HCA fw command */ + dev->caps.uar_page_size = 1 << (hca_param.uar_page_sz + 12); + + /* TODO: relax this assumption */ + if (dev->caps.uar_page_size != PAGE_SIZE) { + mlx4_err(dev, "UAR size:%d != kernel PAGE_SIZE of %ld\n", + dev->caps.uar_page_size, PAGE_SIZE); + return -ENODEV; + } + + memset(&func_cap, 0, sizeof(func_cap)); + err = mlx4_QUERY_FUNC_CAP(dev, &func_cap); + if (err) { + mlx4_err(dev, "QUERY_FUNC_CAP command failed, aborting.\n"); + return err; + } + + if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) != + PF_CONTEXT_BEHAVIOUR_MASK) { + mlx4_err(dev, "Unknown pf context behaviour\n"); + return -ENOSYS; + } + + dev->caps.function = func_cap.function; + dev->caps.num_ports = func_cap.num_ports; + dev->caps.num_qps = func_cap.qp_quota; + dev->caps.num_srqs = func_cap.srq_quota; + dev->caps.num_cqs = func_cap.cq_quota; + dev->caps.num_eqs = func_cap.max_eq; + dev->caps.reserved_eqs = func_cap.reserved_eq; + dev->caps.num_mpts = func_cap.mpt_quota; + dev->caps.num_mtts = func_cap.mtt_quota; + dev->caps.num_pds = MLX4_NUM_PDS; + dev->caps.num_mgms = 0; + dev->caps.num_amgms = 0; + + for (i = 1; i <= dev->caps.num_ports; ++i) + dev->caps.port_mask[i] = dev->caps.port_type[i]; + + if (dev->caps.num_ports > MLX4_MAX_PORTS) { + mlx4_err(dev, "HCA has %d ports, but we only support %d, " + "aborting.\n", dev->caps.num_ports, MLX4_MAX_PORTS); + return -ENODEV; + } + + if (dev->caps.uar_page_size * (dev->caps.num_uars - + dev->caps.reserved_uars) > + pci_resource_len(dev->pdev, 2)) { + mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than " + "PCI resource 2 size of 0x%llx, aborting.\n", + dev->caps.uar_page_size * dev->caps.num_uars, + (unsigned long long) pci_resource_len(dev->pdev, 2)); + return -ENODEV; + } + +#if 0 + mlx4_warn(dev, "sqp_demux:%d\n", dev->caps.sqp_demux); + mlx4_warn(dev, "num_uars:%d reserved_uars:%d uar region:0x%x bar2:0x%llx\n", + dev->caps.num_uars, dev->caps.reserved_uars, + dev->caps.uar_page_size * dev->caps.num_uars, + pci_resource_len(dev->pdev, 2)); + mlx4_warn(dev, "num_eqs:%d reserved_eqs:%d\n", dev->caps.num_eqs, + dev->caps.reserved_eqs); + mlx4_warn(dev, "num_pds:%d reserved_pds:%d slave_pd_shift:%d pd_base:%d\n", + dev->caps.num_pds, dev->caps.reserved_pds, + dev->caps.slave_pd_shift, dev->caps.pd_base); +#endif + return 0; +} /* * Change the port configuration of the device. @@ -456,6 +657,7 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, { struct mlx4_priv *priv = mlx4_priv(dev); int err; + int num_eqs; err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table, cmpt_base + @@ -485,12 +687,14 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, if (err) goto err_srq; + num_eqs = (mlx4_is_master(dev)) ? + roundup_pow_of_two(mlx4_master_get_num_eqs(dev)) : + dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_EQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), - cmpt_entry_sz, - dev->caps.num_eqs, dev->caps.num_eqs, 0, 0); + cmpt_entry_sz, num_eqs, num_eqs, 0, 0); if (err) goto err_cq; @@ -514,6 +718,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, { struct mlx4_priv *priv = mlx4_priv(dev); u64 aux_pages; + int num_eqs; int err; err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages); @@ -545,10 +750,13 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, goto err_unmap_aux; } + + num_eqs = (mlx4_is_master(dev)) ? + roundup_pow_of_two(mlx4_master_get_num_eqs(dev)) : + dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.table, init_hca->eqc_base, dev_cap->eqc_entry_sz, - dev->caps.num_eqs, dev->caps.num_eqs, - 0, 0); + num_eqs, num_eqs, 0, 0); if (err) { mlx4_err(dev, "Failed to map EQ context memory, aborting.\n"); goto err_unmap_cmpt; @@ -732,6 +940,16 @@ static void mlx4_free_icms(struct mlx4_dev *dev) mlx4_free_icm(dev, priv->fw.aux_icm, 0); } +static void mlx4_slave_exit(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + down(&priv->cmd.slave_sem); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME)) + mlx4_warn(dev, "Failed to close slave function.\n"); + up(&priv->cmd.slave_sem); +} + static int map_bf_area(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -739,8 +957,10 @@ static int map_bf_area(struct mlx4_dev *dev) resource_size_t bf_len; int err = 0; - bf_start = pci_resource_start(dev->pdev, 2) + (dev->caps.num_uars << PAGE_SHIFT); - bf_len = pci_resource_len(dev->pdev, 2) - (dev->caps.num_uars << PAGE_SHIFT); + bf_start = pci_resource_start(dev->pdev, 2) + + (dev->caps.num_uars << PAGE_SHIFT); + bf_len = pci_resource_len(dev->pdev, 2) - + (dev->caps.num_uars << PAGE_SHIFT); priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len); if (!priv->bf_mapping) err = -ENOMEM; @@ -757,10 +977,81 @@ static void unmap_bf_area(struct mlx4_dev *dev) static void mlx4_close_hca(struct mlx4_dev *dev) { unmap_bf_area(dev); - mlx4_CLOSE_HCA(dev, 0); - mlx4_free_icms(dev); - mlx4_UNMAP_FA(dev); - mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); + if (mlx4_is_slave(dev)) + mlx4_slave_exit(dev); + else { + mlx4_CLOSE_HCA(dev, 0); + mlx4_free_icms(dev); + mlx4_UNMAP_FA(dev); + mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); + } +} + +static int mlx4_init_slave(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 dma = (u64) priv->mfunc.vhcr_dma; + int num_of_reset_retries = NUM_OF_RESET_RETRIES; + int ret_from_reset = 0; + u32 slave_read; + u32 cmd_channel_ver; + + down(&priv->cmd.slave_sem); + priv->cmd.max_cmds = 1; + mlx4_warn(dev, "Sending reset\n"); + ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, + MLX4_COMM_TIME); + /* if we are in the middle of flr the slave will try + * NUM_OF_RESET_RETRIES times before leaving.*/ + if (ret_from_reset) { + if (MLX4_DELAY_RESET_SLAVE == ret_from_reset) { + msleep(SLEEP_TIME_IN_RESET); + while (ret_from_reset && num_of_reset_retries) { + mlx4_warn(dev, "slave is currently in the" + "middle of FLR. retrying..." + "(try num:%d)\n", + (NUM_OF_RESET_RETRIES - + num_of_reset_retries + 1)); + ret_from_reset = + mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, + 0, MLX4_COMM_TIME); + num_of_reset_retries = num_of_reset_retries - 1; + } + } else + goto err; + } + + /* check the driver version - the slave I/F revision + * must match the master's */ + slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); + cmd_channel_ver = mlx4_comm_get_version(); + + if (MLX4_COMM_GET_IF_REV(cmd_channel_ver) != + MLX4_COMM_GET_IF_REV(slave_read)) { + mlx4_err(dev, "slave driver version is not supported" + " by the master\n"); + goto err; + } + + mlx4_warn(dev, "Sending vhcr0\n"); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, MLX4_COMM_TIME)) + goto err; + up(&priv->cmd.slave_sem); + return 0; + +err: + mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0); + up(&priv->cmd.slave_sem); + return -EIO; } static int mlx4_init_hca(struct mlx4_dev *dev) @@ -774,56 +1065,76 @@ static int mlx4_init_hca(struct mlx4_dev *dev) u64 icm_size; int err; - err = mlx4_QUERY_FW(dev); - if (err) { - if (err == -EACCES) - mlx4_info(dev, "non-primary physical function, skipping.\n"); - else - mlx4_err(dev, "QUERY_FW command failed, aborting.\n"); - return err; - } + if (!mlx4_is_slave(dev)) { + err = mlx4_QUERY_FW(dev); + if (err) { + if (err == -EACCES) + mlx4_info(dev, "non-primary physical function, skipping.\n"); + else + mlx4_err(dev, "QUERY_FW command failed, aborting.\n"); + goto unmap_bf; + } - err = mlx4_load_fw(dev); - if (err) { - mlx4_err(dev, "Failed to start FW, aborting.\n"); - return err; - } + err = mlx4_load_fw(dev); + if (err) { + mlx4_err(dev, "Failed to start FW, aborting.\n"); + goto unmap_bf; + } - mlx4_cfg.log_pg_sz_m = 1; - mlx4_cfg.log_pg_sz = 0; - err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg); - if (err) - mlx4_warn(dev, "Failed to override log_pg_sz parameter\n"); + mlx4_cfg.log_pg_sz_m = 1; + mlx4_cfg.log_pg_sz = 0; + err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg); + if (err) + mlx4_warn(dev, "Failed to override log_pg_sz parameter\n"); - err = mlx4_dev_cap(dev, &dev_cap); - if (err) { - mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); - goto err_stop_fw; - } + err = mlx4_dev_cap(dev, &dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); + goto err_stop_fw; + } - profile = default_profile; + profile = default_profile; - icm_size = mlx4_make_profile(dev, &profile, &dev_cap, &init_hca); - if ((long long) icm_size < 0) { - err = icm_size; - goto err_stop_fw; - } + icm_size = mlx4_make_profile(dev, &profile, &dev_cap, + &init_hca); + if ((long long) icm_size < 0) { + err = icm_size; + goto err_stop_fw; + } - if (map_bf_area(dev)) - mlx4_dbg(dev, "Failed to map blue flame area\n"); + init_hca.log_uar_sz = ilog2(dev->caps.num_uars); + init_hca.uar_page_sz = PAGE_SHIFT - 12; - init_hca.log_uar_sz = ilog2(dev->caps.num_uars); + err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size); + if (err) + goto err_stop_fw; - err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size); - if (err) - goto err_stop_fw; + err = mlx4_INIT_HCA(dev, &init_hca); + if (err) { + mlx4_err(dev, "INIT_HCA command failed, aborting.\n"); + goto err_free_icm; + } + } else { + err = mlx4_init_slave(dev); + if (err) { + mlx4_err(dev, "Failed to initialize slave\n"); + goto unmap_bf; + } - err = mlx4_INIT_HCA(dev, &init_hca); - if (err) { - mlx4_err(dev, "INIT_HCA command failed, aborting.\n"); - goto err_free_icm; + err = mlx4_slave_cap(dev); + if (err) { + mlx4_err(dev, "Failed to obtain slave caps\n"); + goto err_close; + } } + if (map_bf_area(dev)) + mlx4_dbg(dev, "Failed to map blue flame area\n"); + + /*Only the master set the ports, all the rest got it from it.*/ + if (!mlx4_is_slave(dev)) + mlx4_set_port_mask(dev); + err = mlx4_QUERY_ADAPTER(dev, &adapter); if (err) { mlx4_err(dev, "QUERY_ADAPTER command failed, aborting.\n"); @@ -836,16 +1147,19 @@ static int mlx4_init_hca(struct mlx4_dev *dev) return 0; err_close: - mlx4_CLOSE_HCA(dev, 0); + mlx4_close_hca(dev); err_free_icm: - mlx4_free_icms(dev); + if (!mlx4_is_slave(dev)) + mlx4_free_icms(dev); err_stop_fw: + if (!mlx4_is_slave(dev)) { + mlx4_UNMAP_FA(dev); + mlx4_free_icm(dev, priv->fw.fw_icm, 0); + } +unmap_bf: unmap_bf_area(dev); - mlx4_UNMAP_FA(dev); - mlx4_free_icm(dev, priv->fw.fw_icm, 0); - return err; } @@ -992,55 +1306,62 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) goto err_srq_table_free; } - err = mlx4_init_mcg_table(dev); - if (err) { - mlx4_err(dev, "Failed to initialize " - "multicast group table, aborting.\n"); - goto err_qp_table_free; + if (!mlx4_is_slave(dev)) { + err = mlx4_init_mcg_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize " + "multicast group table, aborting.\n"); + goto err_qp_table_free; + } } err = mlx4_init_counters_table(dev); if (err && err != -ENOENT) { mlx4_err(dev, "Failed to initialize counters table, aborting.\n"); - goto err_counters_table_free; + goto err_mcg_table_free; } - for (port = 1; port <= dev->caps.num_ports; port++) { - enum mlx4_port_type port_type = 0; - mlx4_SENSE_PORT(dev, port, &port_type); - if (port_type) - dev->caps.port_type[port] = port_type; - ib_port_default_caps = 0; - err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps); - if (err) - mlx4_warn(dev, "failed to get port %d default " - "ib capabilities (%d). Continuing with " - "caps = 0\n", port, err); - dev->caps.ib_port_def_cap[port] = ib_port_default_caps; - - err = mlx4_check_ext_port_caps(dev, port); - if (err) - mlx4_warn(dev, "failed to get port %d extended " - "port capabilities support info (%d)." - " Assuming not supported\n", port, err); + if (!mlx4_is_slave(dev)) { + for (port = 1; port <= dev->caps.num_ports; port++) { + if (!mlx4_is_mfunc(dev)) { + enum mlx4_port_type port_type = 0; + mlx4_SENSE_PORT(dev, port, &port_type); + if (port_type) + dev->caps.port_type[port] = port_type; + } + ib_port_default_caps = 0; + err = mlx4_get_port_ib_caps(dev, port, + &ib_port_default_caps); + if (err) + mlx4_warn(dev, "failed to get port %d default " + "ib capabilities (%d). Continuing " + "with caps = 0\n", port, err); + dev->caps.ib_port_def_cap[port] = ib_port_default_caps; + + err = mlx4_check_ext_port_caps(dev, port); + if (err) + mlx4_warn(dev, "failed to get port %d extended " + "port capabilities support info (%d)." + " Assuming not supported\n", + port, err); - err = mlx4_SET_PORT(dev, port); - if (err) { - mlx4_err(dev, "Failed to set port %d, aborting\n", - port); - goto err_mcg_table_free; + err = mlx4_SET_PORT(dev, port); + if (err) { + mlx4_err(dev, "Failed to set port %d, aborting\n", + port); + goto err_counters_table_free; + } } } - mlx4_set_port_mask(dev); return 0; -err_mcg_table_free: - mlx4_cleanup_mcg_table(dev); - err_counters_table_free: mlx4_cleanup_counters_table(dev); +err_mcg_table_free: + mlx4_cleanup_mcg_table(dev); + err_qp_table_free: mlx4_cleanup_qp_table(dev); @@ -1087,8 +1408,16 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) int i; if (msi_x) { - nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs, - nreq); + /* In multifunction mode each function gets 2 msi-X vectors + * one for data path completions anf the other for asynch events + * or command completions */ + if (mlx4_is_mfunc(dev)) { + nreq = 2; + } else { + nreq = min_t(int, dev->caps.num_eqs - + dev->caps.reserved_eqs, nreq); + } + entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL); if (!entries) goto no_msi; @@ -1144,16 +1473,24 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) info->dev = dev; info->port = port; - mlx4_init_mac_table(dev, &info->mac_table); - mlx4_init_vlan_table(dev, &info->vlan_table); - info->base_qpn = dev->caps.reserved_qps_base[MLX4_QP_REGION_ETH_ADDR] + + if (!mlx4_is_slave(dev)) { + INIT_RADIX_TREE(&info->mac_tree, GFP_KERNEL); + mlx4_init_mac_table(dev, &info->mac_table); + mlx4_init_vlan_table(dev, &info->vlan_table); + info->base_qpn = + dev->caps.reserved_qps_base[MLX4_QP_REGION_ETH_ADDR] + (port - 1) * (1 << log_num_mac); + } sprintf(info->dev_name, "mlx4_port%d", port); info->port_attr.attr.name = info->dev_name; - info->port_attr.attr.mode = S_IRUGO | S_IWUSR; + if (mlx4_is_mfunc(dev)) + info->port_attr.attr.mode = S_IRUGO; + else { + info->port_attr.attr.mode = S_IRUGO | S_IWUSR; + info->port_attr.store = set_port_type; + } info->port_attr.show = show_port_type; - info->port_attr.store = set_port_type; sysfs_attr_init(&info->port_attr.attr); err = device_create_file(&dev->pdev->dev, &info->port_attr); @@ -1226,6 +1563,46 @@ static void mlx4_clear_steering(struct mlx4_dev *dev) kfree(priv->steer); } +static int extended_func_num(struct pci_dev *pdev) +{ + return PCI_SLOT(pdev->devfn) * 8 + PCI_FUNC(pdev->devfn); +} + +#define MLX4_OWNER_BASE 0x8069c +#define MLX4_OWNER_SIZE 4 + +static int mlx4_get_ownership(struct mlx4_dev *dev) +{ + void __iomem *owner; + u32 ret; + + owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + MLX4_OWNER_SIZE); + if (!owner) { + mlx4_err(dev, "Failed to obtain ownership bit\n"); + return -ENOMEM; + } + + ret = readl(owner); + iounmap(owner); + return (int) !!ret; +} + +static void mlx4_free_ownership(struct mlx4_dev *dev) +{ + void __iomem *owner; + + owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + MLX4_OWNER_SIZE); + if (!owner) { + mlx4_err(dev, "Failed to obtain ownership bit\n"); + return; + } + writel(0, owner); + msleep(1000); + iounmap(owner); +} + static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { struct mlx4_priv *priv; @@ -1241,13 +1618,20 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) "aborting.\n"); return err; } - + if (num_vfs > MLX4_MAX_NUM_VF) { + printk(KERN_ERR "There are more VF's (%d) than allowed(%d)\n", + num_vfs, MLX4_MAX_NUM_VF); + return -EINVAL; + } /* - * Check for BARs. We expect 0: 1MB + * Check for BARs. */ - if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) || - pci_resource_len(pdev, 0) != 1 << 20) { - dev_err(&pdev->dev, "Missing DCS, aborting.\n"); + if (((id == NULL) || !(id->driver_data & MLX4_VF)) && + !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing DCS, aborting." + "(id == 0X%p, id->driver_data: 0x%lx," + " pci_resource_flags(pdev, 0):0x%lx)\n", id, + id ? id->driver_data : 0, pci_resource_flags(pdev, 0)); err = -ENODEV; goto err_disable_pdev; } @@ -1311,42 +1695,132 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) mutex_init(&priv->bf_mutex); dev->rev_id = pdev->revision; + /* Detect if this device is a virtual function */ + if (id && id->driver_data & MLX4_VF) { + /* When acting as pf, we normally skip vfs unless explicitly + * requested to probe them. */ + if (num_vfs && extended_func_num(pdev) > probe_vf) { + mlx4_warn(dev, "Skipping virtual function:%d\n", + extended_func_num(pdev)); + err = -ENODEV; + goto err_free_dev; + } + mlx4_warn(dev, "Detected virtual function - running in slave mode\n"); + dev->flags |= MLX4_FLAG_SLAVE; + } else { + /* We reset the device and enable SRIOV only for physical + * devices. Try to claim ownership on the device; + * if already taken, skip -- do not allow multiple PFs */ + err = mlx4_get_ownership(dev); + if (err) { + if (err < 0) + goto err_free_dev; + else { + mlx4_warn(dev, "Multiple PFs not yet supported." + " Skipping PF.\n"); + err = -EINVAL; + goto err_free_dev; + } + } - /* - * Now reset the HCA before we touch the PCI capabilities or - * attempt a firmware command, since a boot ROM may have left - * the HCA in an undefined state. - */ - err = mlx4_reset(dev); - if (err) { - mlx4_err(dev, "Failed to reset HCA, aborting.\n"); - goto err_free_dev; + if (num_vfs) { + mlx4_warn(dev, "Enabling sriov with:%d vfs\n", num_vfs); + err = pci_enable_sriov(pdev, num_vfs); + if (err) { + mlx4_err(dev, "Failed to enable sriov," + "continuing without sriov enabled" + " (err = %d).\n", err); + num_vfs = 0; + err = 0; + } else { + mlx4_warn(dev, "Running in master mode\n"); + dev->flags |= MLX4_FLAG_SRIOV | + MLX4_FLAG_MASTER; + dev->num_vfs = num_vfs; + } + } + + /* + * Now reset the HCA before we touch the PCI capabilities or + * attempt a firmware command, since a boot ROM may have left + * the HCA in an undefined state. + */ + err = mlx4_reset(dev); + if (err) { + mlx4_err(dev, "Failed to reset HCA, aborting.\n"); + goto err_rel_own; + } } +slave_start: if (mlx4_cmd_init(dev)) { mlx4_err(dev, "Failed to init command interface, aborting.\n"); - goto err_free_dev; + goto err_sriov; + } + + /* In slave functions, the communication channel must be initialized + * before posting commands. Also, init num_slaves before calling + * mlx4_init_hca */ + if (mlx4_is_mfunc(dev)) { + if (mlx4_is_master(dev)) + dev->num_slaves = MLX4_MAX_NUM_SLAVES; + else { + dev->num_slaves = 0; + if (mlx4_multi_func_init(dev)) { + mlx4_err(dev, "Failed to init slave mfunc" + " interface, aborting.\n"); + goto err_cmd; + } + } } err = mlx4_init_hca(dev); - if (err) - goto err_cmd; + if (err) { + if (err == -EACCES) { + /* Not primary Physical function + * Running in slave mode */ + mlx4_cmd_cleanup(dev); + dev->flags |= MLX4_FLAG_SLAVE; + dev->flags &= ~MLX4_FLAG_MASTER; + goto slave_start; + } else + goto err_mfunc; + } + + /* In master functions, the communication channel must be initialized + * after obtaining its address from fw */ + if (mlx4_is_master(dev)) { + if (mlx4_multi_func_init(dev)) { + mlx4_err(dev, "Failed to init master mfunc" + "interface, aborting.\n"); + goto err_close; + } + } err = mlx4_alloc_eq_table(dev); if (err) - goto err_close; + goto err_master_mfunc; priv->msix_ctl.pool_bm = 0; spin_lock_init(&priv->msix_ctl.pool_lock); mlx4_enable_msi_x(dev); - - err = mlx4_init_steering(dev); - if (err) + if ((mlx4_is_mfunc(dev)) && + !(dev->flags & MLX4_FLAG_MSI_X)) { + mlx4_err(dev, "INTx is not supported in multi-function mode." + " aborting.\n"); goto err_free_eq; + } + + if (!mlx4_is_slave(dev)) { + err = mlx4_init_steering(dev); + if (err) + goto err_free_eq; + } err = mlx4_setup_hca(dev); - if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X)) { + if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X) && + !mlx4_is_mfunc(dev)) { dev->flags &= ~MLX4_FLAG_MSI_X; pci_disable_msix(pdev); err = mlx4_setup_hca(dev); @@ -1389,20 +1863,37 @@ err_port: mlx4_cleanup_uar_table(dev); err_steer: - mlx4_clear_steering(dev); + if (!mlx4_is_slave(dev)) + mlx4_clear_steering(dev); err_free_eq: mlx4_free_eq_table(dev); +err_master_mfunc: + if (mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); + err_close: if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); mlx4_close_hca(dev); +err_mfunc: + if (mlx4_is_slave(dev)) + mlx4_multi_func_cleanup(dev); + err_cmd: mlx4_cmd_cleanup(dev); +err_sriov: + if (num_vfs && (dev->flags & MLX4_FLAG_SRIOV)) + pci_disable_sriov(pdev); + +err_rel_own: + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + err_free_dev: kfree(priv); @@ -1430,6 +1921,12 @@ static void mlx4_remove_one(struct pci_dev *pdev) int p; if (dev) { + /* in SRIOV it is not allowed to unload the pf's + * driver while there are alive vf's */ + if (mlx4_is_master(dev)) { + if (mlx4_how_many_lives_vf(dev)) + printk(KERN_ERR "Removing PF when there are assigned VF's !!!\n"); + } mlx4_stop_sense(dev); mlx4_unregister_device(dev); @@ -1449,17 +1946,31 @@ static void mlx4_remove_one(struct pci_dev *pdev) mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); + if (mlx4_is_master(dev)) + mlx4_free_resource_tracker(dev); + iounmap(priv->kar); mlx4_uar_free(dev, &priv->driver_uar); mlx4_cleanup_uar_table(dev); - mlx4_clear_steering(dev); + if (!mlx4_is_slave(dev)) + mlx4_clear_steering(dev); mlx4_free_eq_table(dev); + if (mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); mlx4_close_hca(dev); + if (mlx4_is_slave(dev)) + mlx4_multi_func_cleanup(dev); mlx4_cmd_cleanup(dev); if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); + if (num_vfs && (dev->flags & MLX4_FLAG_SRIOV)) { + mlx4_warn(dev, "Disabling sriov\n"); + pci_disable_sriov(pdev); + } + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); kfree(priv); pci_release_regions(pdev); pci_disable_device(pdev); @@ -1474,33 +1985,48 @@ int mlx4_restart_one(struct pci_dev *pdev) } static DEFINE_PCI_DEVICE_TABLE(mlx4_pci_table) = { - { PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */ - { PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */ - { PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */ - { PCI_VDEVICE(MELLANOX, 0x6732) }, /* MT25408 "Hermon" DDR PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */ - { PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */ - { PCI_VDEVICE(MELLANOX, 0x675a) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ - { PCI_VDEVICE(MELLANOX, 0x6764) }, /* MT26468 ConnectX EN 10GigE PCIe gen2*/ - { PCI_VDEVICE(MELLANOX, 0x6746) }, /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ - { PCI_VDEVICE(MELLANOX, 0x676e) }, /* MT26478 ConnectX2 40GigE PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x1002) }, /* MT25400 Family [ConnectX-2 Virtual Function] */ - { PCI_VDEVICE(MELLANOX, 0x1003) }, /* MT27500 Family [ConnectX-3] */ - { PCI_VDEVICE(MELLANOX, 0x1004) }, /* MT27500 Family [ConnectX-3 Virtual Function] */ - { PCI_VDEVICE(MELLANOX, 0x1005) }, /* MT27510 Family */ - { PCI_VDEVICE(MELLANOX, 0x1006) }, /* MT27511 Family */ - { PCI_VDEVICE(MELLANOX, 0x1007) }, /* MT27520 Family */ - { PCI_VDEVICE(MELLANOX, 0x1008) }, /* MT27521 Family */ - { PCI_VDEVICE(MELLANOX, 0x1009) }, /* MT27530 Family */ - { PCI_VDEVICE(MELLANOX, 0x100a) }, /* MT27531 Family */ - { PCI_VDEVICE(MELLANOX, 0x100b) }, /* MT27540 Family */ - { PCI_VDEVICE(MELLANOX, 0x100c) }, /* MT27541 Family */ - { PCI_VDEVICE(MELLANOX, 0x100d) }, /* MT27550 Family */ - { PCI_VDEVICE(MELLANOX, 0x100e) }, /* MT27551 Family */ - { PCI_VDEVICE(MELLANOX, 0x100f) }, /* MT27560 Family */ - { PCI_VDEVICE(MELLANOX, 0x1010) }, /* MT27561 Family */ + /* MT25408 "Hermon" SDR */ + { PCI_VDEVICE(MELLANOX, 0x6340), 0 }, + /* MT25408 "Hermon" DDR */ + { PCI_VDEVICE(MELLANOX, 0x634a), 0 }, + /* MT25408 "Hermon" QDR */ + { PCI_VDEVICE(MELLANOX, 0x6354), 0 }, + /* MT25408 "Hermon" DDR PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x6732), 0 }, + /* MT25408 "Hermon" QDR PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x673c), 0 }, + /* MT25408 "Hermon" EN 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x6368), 0 }, + /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x6750), 0 }, + /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x6372), 0 }, + /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x675a), 0 }, + /* MT26468 ConnectX EN 10GigE PCIe gen2*/ + { PCI_VDEVICE(MELLANOX, 0x6764), 0 }, + /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ + { PCI_VDEVICE(MELLANOX, 0x6746), 0 }, + /* MT26478 ConnectX2 40GigE PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x676e), 0 }, + /* MT25400 Family [ConnectX-2 Virtual Function] */ + { PCI_VDEVICE(MELLANOX, 0x1002), MLX4_VF }, + /* MT27500 Family [ConnectX-3] */ + { PCI_VDEVICE(MELLANOX, 0x1003), 0 }, + /* MT27500 Family [ConnectX-3 Virtual Function] */ + { PCI_VDEVICE(MELLANOX, 0x1004), MLX4_VF }, + { PCI_VDEVICE(MELLANOX, 0x1005), 0 }, /* MT27510 Family */ + { PCI_VDEVICE(MELLANOX, 0x1006), 0 }, /* MT27511 Family */ + { PCI_VDEVICE(MELLANOX, 0x1007), 0 }, /* MT27520 Family */ + { PCI_VDEVICE(MELLANOX, 0x1008), 0 }, /* MT27521 Family */ + { PCI_VDEVICE(MELLANOX, 0x1009), 0 }, /* MT27530 Family */ + { PCI_VDEVICE(MELLANOX, 0x100a), 0 }, /* MT27531 Family */ + { PCI_VDEVICE(MELLANOX, 0x100b), 0 }, /* MT27540 Family */ + { PCI_VDEVICE(MELLANOX, 0x100c), 0 }, /* MT27541 Family */ + { PCI_VDEVICE(MELLANOX, 0x100d), 0 }, /* MT27550 Family */ + { PCI_VDEVICE(MELLANOX, 0x100e), 0 }, /* MT27551 Family */ + { PCI_VDEVICE(MELLANOX, 0x100f), 0 }, /* MT27560 Family */ + { PCI_VDEVICE(MELLANOX, 0x1010), 0 }, /* MT27561 Family */ { 0, } }; @@ -1529,6 +2055,12 @@ static int __init mlx4_verify_params(void) return -1; } + /* Check if module param for ports type has legal combination */ + if (port_type_array[0] == false && port_type_array[1] == true) { + printk(KERN_WARNING "Module parameter configuration ETH/IB is not supported. Switching to default configuration IB/IB\n"); + port_type_array[0] = true; + } + return 0; } |