root/drivers/net/ethernet/mellanox/mlx4/catas.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. read_vendor_id
  2. mlx4_reset_master
  3. mlx4_reset_slave
  4. mlx4_comm_internal_err
  5. mlx4_enter_error_state
  6. mlx4_handle_error_state
  7. dump_err_buf
  8. poll_catas
  9. catas_reset
  10. mlx4_start_catas_poll
  11. mlx4_stop_catas_poll
  12. mlx4_catas_init
  13. mlx4_catas_end

   1 /*
   2  * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
   3  * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
   4  *
   5  * This software is available to you under a choice of one of two
   6  * licenses.  You may choose to be licensed under the terms of the GNU
   7  * General Public License (GPL) Version 2, available from the file
   8  * COPYING in the main directory of this source tree, or the
   9  * OpenIB.org BSD license below:
  10  *
  11  *     Redistribution and use in source and binary forms, with or
  12  *     without modification, are permitted provided that the following
  13  *     conditions are met:
  14  *
  15  *      - Redistributions of source code must retain the above
  16  *        copyright notice, this list of conditions and the following
  17  *        disclaimer.
  18  *
  19  *      - Redistributions in binary form must reproduce the above
  20  *        copyright notice, this list of conditions and the following
  21  *        disclaimer in the documentation and/or other materials
  22  *        provided with the distribution.
  23  *
  24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  31  * SOFTWARE.
  32  */
  33 
  34 #include <linux/workqueue.h>
  35 #include <linux/module.h>
  36 
  37 #include "mlx4.h"
  38 
  39 enum {
  40         MLX4_CATAS_POLL_INTERVAL        = 5 * HZ,
  41 };
  42 
  43 
  44 
  45 int mlx4_internal_err_reset = 1;
  46 module_param_named(internal_err_reset, mlx4_internal_err_reset,  int, 0644);
  47 MODULE_PARM_DESC(internal_err_reset,
  48                  "Reset device on internal errors if non-zero (default 1)");
  49 
  50 static int read_vendor_id(struct mlx4_dev *dev)
  51 {
  52         u16 vendor_id = 0;
  53         int ret;
  54 
  55         ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id);
  56         if (ret) {
  57                 mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret);
  58                 return ret;
  59         }
  60 
  61         if (vendor_id == 0xffff) {
  62                 mlx4_err(dev, "PCI can't be accessed to read vendor id\n");
  63                 return -EINVAL;
  64         }
  65 
  66         return 0;
  67 }
  68 
  69 static int mlx4_reset_master(struct mlx4_dev *dev)
  70 {
  71         int err = 0;
  72 
  73         if (mlx4_is_master(dev))
  74                 mlx4_report_internal_err_comm_event(dev);
  75 
  76         if (!pci_channel_offline(dev->persist->pdev)) {
  77                 err = read_vendor_id(dev);
  78                 /* If PCI can't be accessed to read vendor ID we assume that its
  79                  * link was disabled and chip was already reset.
  80                  */
  81                 if (err)
  82                         return 0;
  83 
  84                 err = mlx4_reset(dev);
  85                 if (err)
  86                         mlx4_err(dev, "Fail to reset HCA\n");
  87         }
  88 
  89         return err;
  90 }
  91 
  92 static int mlx4_reset_slave(struct mlx4_dev *dev)
  93 {
  94 #define COM_CHAN_RST_REQ_OFFSET 0x10
  95 #define COM_CHAN_RST_ACK_OFFSET 0x08
  96 
  97         u32 comm_flags;
  98         u32 rst_req;
  99         u32 rst_ack;
 100         unsigned long end;
 101         struct mlx4_priv *priv = mlx4_priv(dev);
 102 
 103         if (pci_channel_offline(dev->persist->pdev))
 104                 return 0;
 105 
 106         comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
 107                                   MLX4_COMM_CHAN_FLAGS));
 108         if (comm_flags == 0xffffffff) {
 109                 mlx4_err(dev, "VF reset is not needed\n");
 110                 return 0;
 111         }
 112 
 113         if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) {
 114                 mlx4_err(dev, "VF reset is not supported\n");
 115                 return -EOPNOTSUPP;
 116         }
 117 
 118         rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
 119                 COM_CHAN_RST_REQ_OFFSET;
 120         rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
 121                 COM_CHAN_RST_ACK_OFFSET;
 122         if (rst_req != rst_ack) {
 123                 mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n");
 124                 return -EIO;
 125         }
 126 
 127         rst_req ^= 1;
 128         mlx4_warn(dev, "VF is sending reset request to Firmware\n");
 129         comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET;
 130         __raw_writel((__force u32)cpu_to_be32(comm_flags),
 131                      (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS);
 132 
 133         end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies;
 134         while (time_before(jiffies, end)) {
 135                 comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
 136                                           MLX4_COMM_CHAN_FLAGS));
 137                 rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
 138                         COM_CHAN_RST_ACK_OFFSET;
 139 
 140                 /* Reading rst_req again since the communication channel can
 141                  * be reset at any time by the PF and all its bits will be
 142                  * set to zero.
 143                  */
 144                 rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
 145                         COM_CHAN_RST_REQ_OFFSET;
 146 
 147                 if (rst_ack == rst_req) {
 148                         mlx4_warn(dev, "VF Reset succeed\n");
 149                         return 0;
 150                 }
 151                 cond_resched();
 152         }
 153         mlx4_err(dev, "Fail to send reset over the communication channel\n");
 154         return -ETIMEDOUT;
 155 }
 156 
 157 int mlx4_comm_internal_err(u32 slave_read)
 158 {
 159         return (u32)COMM_CHAN_EVENT_INTERNAL_ERR ==
 160                 (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0;
 161 }
 162 
 163 void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
 164 {
 165         int err;
 166         struct mlx4_dev *dev;
 167 
 168         if (!mlx4_internal_err_reset)
 169                 return;
 170 
 171         mutex_lock(&persist->device_state_mutex);
 172         if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
 173                 goto out;
 174 
 175         dev = persist->dev;
 176         mlx4_err(dev, "device is going to be reset\n");
 177         if (mlx4_is_slave(dev)) {
 178                 err = mlx4_reset_slave(dev);
 179         } else {
 180                 mlx4_crdump_collect(dev);
 181                 err = mlx4_reset_master(dev);
 182         }
 183 
 184         if (!err) {
 185                 mlx4_err(dev, "device was reset successfully\n");
 186         } else {
 187                 /* EEH could have disabled the PCI channel during reset. That's
 188                  * recoverable and the PCI error flow will handle it.
 189                  */
 190                 if (!pci_channel_offline(dev->persist->pdev))
 191                         BUG_ON(1);
 192         }
 193         dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
 194         mutex_unlock(&persist->device_state_mutex);
 195 
 196         /* At that step HW was already reset, now notify clients */
 197         mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
 198         mlx4_cmd_wake_completions(dev);
 199         return;
 200 
 201 out:
 202         mutex_unlock(&persist->device_state_mutex);
 203 }
 204 
 205 static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist)
 206 {
 207         int err = 0;
 208 
 209         mlx4_enter_error_state(persist);
 210         mutex_lock(&persist->interface_state_mutex);
 211         if (persist->interface_state & MLX4_INTERFACE_STATE_UP &&
 212             !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) {
 213                 err = mlx4_restart_one(persist->pdev);
 214                 mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n",
 215                           err);
 216         }
 217         mutex_unlock(&persist->interface_state_mutex);
 218 }
 219 
 220 static void dump_err_buf(struct mlx4_dev *dev)
 221 {
 222         struct mlx4_priv *priv = mlx4_priv(dev);
 223 
 224         int i;
 225 
 226         mlx4_err(dev, "Internal error detected:\n");
 227         for (i = 0; i < priv->fw.catas_size; ++i)
 228                 mlx4_err(dev, "  buf[%02x]: %08x\n",
 229                          i, swab32(readl(priv->catas_err.map + i)));
 230 }
 231 
 232 static void poll_catas(struct timer_list *t)
 233 {
 234         struct mlx4_priv *priv = from_timer(priv, t, catas_err.timer);
 235         struct mlx4_dev *dev = &priv->dev;
 236         u32 slave_read;
 237 
 238         if (mlx4_is_slave(dev)) {
 239                 slave_read = swab32(readl(&priv->mfunc.comm->slave_read));
 240                 if (mlx4_comm_internal_err(slave_read)) {
 241                         mlx4_warn(dev, "Internal error detected on the communication channel\n");
 242                         goto internal_err;
 243                 }
 244         } else if (readl(priv->catas_err.map)) {
 245                 dump_err_buf(dev);
 246                 goto internal_err;
 247         }
 248 
 249         if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
 250                 mlx4_warn(dev, "Internal error mark was detected on device\n");
 251                 goto internal_err;
 252         }
 253 
 254         mod_timer(&priv->catas_err.timer,
 255                   round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
 256         return;
 257 
 258 internal_err:
 259         if (mlx4_internal_err_reset)
 260                 queue_work(dev->persist->catas_wq, &dev->persist->catas_work);
 261 }
 262 
 263 static void catas_reset(struct work_struct *work)
 264 {
 265         struct mlx4_dev_persistent *persist =
 266                 container_of(work, struct mlx4_dev_persistent,
 267                              catas_work);
 268 
 269         mlx4_handle_error_state(persist);
 270 }
 271 
 272 void mlx4_start_catas_poll(struct mlx4_dev *dev)
 273 {
 274         struct mlx4_priv *priv = mlx4_priv(dev);
 275         phys_addr_t addr;
 276 
 277         INIT_LIST_HEAD(&priv->catas_err.list);
 278         timer_setup(&priv->catas_err.timer, poll_catas, 0);
 279         priv->catas_err.map = NULL;
 280 
 281         if (!mlx4_is_slave(dev)) {
 282                 addr = pci_resource_start(dev->persist->pdev,
 283                                           priv->fw.catas_bar) +
 284                                           priv->fw.catas_offset;
 285 
 286                 priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
 287                 if (!priv->catas_err.map) {
 288                         mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n",
 289                                   (unsigned long long)addr);
 290                         return;
 291                 }
 292         }
 293 
 294         priv->catas_err.timer.expires  =
 295                 round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL);
 296         add_timer(&priv->catas_err.timer);
 297 }
 298 
 299 void mlx4_stop_catas_poll(struct mlx4_dev *dev)
 300 {
 301         struct mlx4_priv *priv = mlx4_priv(dev);
 302 
 303         del_timer_sync(&priv->catas_err.timer);
 304 
 305         if (priv->catas_err.map) {
 306                 iounmap(priv->catas_err.map);
 307                 priv->catas_err.map = NULL;
 308         }
 309 
 310         if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION)
 311                 flush_workqueue(dev->persist->catas_wq);
 312 }
 313 
 314 int  mlx4_catas_init(struct mlx4_dev *dev)
 315 {
 316         INIT_WORK(&dev->persist->catas_work, catas_reset);
 317         dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health");
 318         if (!dev->persist->catas_wq)
 319                 return -ENOMEM;
 320 
 321         return 0;
 322 }
 323 
 324 void mlx4_catas_end(struct mlx4_dev *dev)
 325 {
 326         if (dev->persist->catas_wq) {
 327                 destroy_workqueue(dev->persist->catas_wq);
 328                 dev->persist->catas_wq = NULL;
 329         }
 330 }

/* [<][>][^][v][top][bottom][index][help] */