Commit 184e2516 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband

Pull more infiniband changes from Roland Dreier:
 "Second batch of InfiniBand/RDMA changes for 3.8:
   - cxgb4 changes to fix lookup engine hash collisions
   - mlx4 changes to make flow steering usable
   - fix to IPoIB to avoid pinning dst reference for too long"

* tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband:
  RDMA/cxgb4: Fix bug for active and passive LE hash collision path
  RDMA/cxgb4: Fix LE hash collision bug for passive open connection
  RDMA/cxgb4: Fix LE hash collision bug for active open connection
  mlx4_core: Allow choosing flow steering mode
  mlx4_core: Adjustments to Flow Steering activation logic for SR-IOV
  mlx4_core: Fix error flow in the flow steering wrapper
  mlx4_core: Add QPN enforcement for flow steering rules set by VFs
  cxgb4: Add LE hash collision bug fix path in LLD driver
  cxgb4: Add T4 filter support
  IPoIB: Call skb_dst_drop() once skb is enqueued for sending
parents 0264405b d72623b6
This diff is collapsed.
......@@ -279,6 +279,11 @@ static int stats_show(struct seq_file *seq, void *v)
seq_printf(seq, " DB State: %s Transitions %llu\n",
db_state_str[dev->db_state],
dev->rdev.stats.db_state_transitions);
seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full);
seq_printf(seq, "ACT_OFLD_CONN_FAILS: %10llu\n",
dev->rdev.stats.act_ofld_conn_fails);
seq_printf(seq, "PAS_OFLD_CONN_FAILS: %10llu\n",
dev->rdev.stats.pas_ofld_conn_fails);
return 0;
}
......@@ -309,6 +314,9 @@ static ssize_t stats_clear(struct file *file, const char __user *buf,
dev->rdev.stats.db_empty = 0;
dev->rdev.stats.db_drop = 0;
dev->rdev.stats.db_state_transitions = 0;
dev->rdev.stats.tcam_full = 0;
dev->rdev.stats.act_ofld_conn_fails = 0;
dev->rdev.stats.pas_ofld_conn_fails = 0;
mutex_unlock(&dev->rdev.stats.lock);
return count;
}
......@@ -322,6 +330,113 @@ static const struct file_operations stats_debugfs_fops = {
.write = stats_clear,
};
static int dump_ep(int id, void *p, void *data)
{
struct c4iw_ep *ep = p;
struct c4iw_debugfs_data *epd = data;
int space;
int cc;
space = epd->bufsize - epd->pos - 1;
if (space == 0)
return 1;
cc = snprintf(epd->buf + epd->pos, space,
"ep %p cm_id %p qp %p state %d flags 0x%lx history 0x%lx "
"hwtid %d atid %d %pI4:%d <-> %pI4:%d\n",
ep, ep->com.cm_id, ep->com.qp, (int)ep->com.state,
ep->com.flags, ep->com.history, ep->hwtid, ep->atid,
&ep->com.local_addr.sin_addr.s_addr,
ntohs(ep->com.local_addr.sin_port),
&ep->com.remote_addr.sin_addr.s_addr,
ntohs(ep->com.remote_addr.sin_port));
if (cc < space)
epd->pos += cc;
return 0;
}
static int dump_listen_ep(int id, void *p, void *data)
{
struct c4iw_listen_ep *ep = p;
struct c4iw_debugfs_data *epd = data;
int space;
int cc;
space = epd->bufsize - epd->pos - 1;
if (space == 0)
return 1;
cc = snprintf(epd->buf + epd->pos, space,
"ep %p cm_id %p state %d flags 0x%lx stid %d backlog %d "
"%pI4:%d\n", ep, ep->com.cm_id, (int)ep->com.state,
ep->com.flags, ep->stid, ep->backlog,
&ep->com.local_addr.sin_addr.s_addr,
ntohs(ep->com.local_addr.sin_port));
if (cc < space)
epd->pos += cc;
return 0;
}
static int ep_release(struct inode *inode, struct file *file)
{
struct c4iw_debugfs_data *epd = file->private_data;
if (!epd) {
pr_info("%s null qpd?\n", __func__);
return 0;
}
vfree(epd->buf);
kfree(epd);
return 0;
}
static int ep_open(struct inode *inode, struct file *file)
{
struct c4iw_debugfs_data *epd;
int ret = 0;
int count = 1;
epd = kmalloc(sizeof(*epd), GFP_KERNEL);
if (!epd) {
ret = -ENOMEM;
goto out;
}
epd->devp = inode->i_private;
epd->pos = 0;
spin_lock_irq(&epd->devp->lock);
idr_for_each(&epd->devp->hwtid_idr, count_idrs, &count);
idr_for_each(&epd->devp->atid_idr, count_idrs, &count);
idr_for_each(&epd->devp->stid_idr, count_idrs, &count);
spin_unlock_irq(&epd->devp->lock);
epd->bufsize = count * 160;
epd->buf = vmalloc(epd->bufsize);
if (!epd->buf) {
ret = -ENOMEM;
goto err1;
}
spin_lock_irq(&epd->devp->lock);
idr_for_each(&epd->devp->hwtid_idr, dump_ep, epd);
idr_for_each(&epd->devp->atid_idr, dump_ep, epd);
idr_for_each(&epd->devp->stid_idr, dump_listen_ep, epd);
spin_unlock_irq(&epd->devp->lock);
file->private_data = epd;
goto out;
err1:
kfree(epd);
out:
return ret;
}
static const struct file_operations ep_debugfs_fops = {
.owner = THIS_MODULE,
.open = ep_open,
.release = ep_release,
.read = debugfs_read,
};
static int setup_debugfs(struct c4iw_dev *devp)
{
struct dentry *de;
......@@ -344,6 +459,11 @@ static int setup_debugfs(struct c4iw_dev *devp)
if (de && de->d_inode)
de->d_inode->i_size = 4096;
de = debugfs_create_file("eps", S_IWUSR, devp->debugfs_root,
(void *)devp, &ep_debugfs_fops);
if (de && de->d_inode)
de->d_inode->i_size = 4096;
return 0;
}
......@@ -475,6 +595,9 @@ static void c4iw_dealloc(struct uld_ctx *ctx)
idr_destroy(&ctx->dev->cqidr);
idr_destroy(&ctx->dev->qpidr);
idr_destroy(&ctx->dev->mmidr);
idr_destroy(&ctx->dev->hwtid_idr);
idr_destroy(&ctx->dev->stid_idr);
idr_destroy(&ctx->dev->atid_idr);
iounmap(ctx->dev->rdev.oc_mw_kva);
ib_dealloc_device(&ctx->dev->ibdev);
ctx->dev = NULL;
......@@ -532,6 +655,9 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)
idr_init(&devp->cqidr);
idr_init(&devp->qpidr);
idr_init(&devp->mmidr);
idr_init(&devp->hwtid_idr);
idr_init(&devp->stid_idr);
idr_init(&devp->atid_idr);
spin_lock_init(&devp->lock);
mutex_init(&devp->rdev.stats.lock);
mutex_init(&devp->db_mutex);
......@@ -577,14 +703,76 @@ out:
return ctx;
}
static inline struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl,
const __be64 *rsp,
u32 pktshift)
{
struct sk_buff *skb;
/*
* Allocate space for cpl_pass_accept_req which will be synthesized by
* driver. Once the driver synthesizes the request the skb will go
* through the regular cpl_pass_accept_req processing.
* The math here assumes sizeof cpl_pass_accept_req >= sizeof
* cpl_rx_pkt.
*/
skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) +
sizeof(struct rss_header) - pktshift, GFP_ATOMIC);
if (unlikely(!skb))
return NULL;
__skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req) +
sizeof(struct rss_header) - pktshift);
/*
* This skb will contain:
* rss_header from the rspq descriptor (1 flit)
* cpl_rx_pkt struct from the rspq descriptor (2 flits)
* space for the difference between the size of an
* rx_pkt and pass_accept_req cpl (1 flit)
* the packet data from the gl
*/
skb_copy_to_linear_data(skb, rsp, sizeof(struct cpl_pass_accept_req) +
sizeof(struct rss_header));
skb_copy_to_linear_data_offset(skb, sizeof(struct rss_header) +
sizeof(struct cpl_pass_accept_req),
gl->va + pktshift,
gl->tot_len - pktshift);
return skb;
}
static inline int recv_rx_pkt(struct c4iw_dev *dev, const struct pkt_gl *gl,
const __be64 *rsp)
{
unsigned int opcode = *(u8 *)rsp;
struct sk_buff *skb;
if (opcode != CPL_RX_PKT)
goto out;
skb = copy_gl_to_skb_pkt(gl , rsp, dev->rdev.lldi.sge_pktshift);
if (skb == NULL)
goto out;
if (c4iw_handlers[opcode] == NULL) {
pr_info("%s no handler opcode 0x%x...\n", __func__,
opcode);
kfree_skb(skb);
goto out;
}
c4iw_handlers[opcode](dev, skb);
return 1;
out:
return 0;
}
static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp,
const struct pkt_gl *gl)
{
struct uld_ctx *ctx = handle;
struct c4iw_dev *dev = ctx->dev;
struct sk_buff *skb;
const struct cpl_act_establish *rpl;
unsigned int opcode;
u8 opcode;
if (gl == NULL) {
/* omit RSS and rsp_ctrl at end of descriptor */
......@@ -600,6 +788,18 @@ static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp,
u32 qid = be32_to_cpu(rc->pldbuflen_qid);
c4iw_ev_handler(dev, qid);
return 0;
} else if (unlikely(*(u8 *)rsp != *(u8 *)gl->va)) {
if (recv_rx_pkt(dev, gl, rsp))
return 0;
pr_info("%s: unexpected FL contents at %p, " \
"RSS %#llx, FL %#llx, len %u\n",
pci_name(ctx->lldi.pdev), gl->va,
(unsigned long long)be64_to_cpu(*rsp),
(unsigned long long)be64_to_cpu(*(u64 *)gl->va),
gl->tot_len);
return 0;
} else {
skb = cxgb4_pktgl_to_skb(gl, 128, 128);
......@@ -607,13 +807,11 @@ static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp,
goto nomem;
}
rpl = cplhdr(skb);
opcode = rpl->ot.opcode;
opcode = *(u8 *)rsp;
if (c4iw_handlers[opcode])
c4iw_handlers[opcode](dev, skb);
else
printk(KERN_INFO "%s no handler opcode 0x%x...\n", __func__,
pr_info("%s no handler opcode 0x%x...\n", __func__,
opcode);
return 0;
......
......@@ -130,6 +130,9 @@ struct c4iw_stats {
u64 db_empty;
u64 db_drop;
u64 db_state_transitions;
u64 tcam_full;
u64 act_ofld_conn_fails;
u64 pas_ofld_conn_fails;
};
struct c4iw_rdev {
......@@ -223,6 +226,9 @@ struct c4iw_dev {
struct dentry *debugfs_root;
enum db_state db_state;
int qpcnt;
struct idr hwtid_idr;
struct idr atid_idr;
struct idr stid_idr;
};
static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev)
......@@ -712,6 +718,31 @@ enum c4iw_ep_flags {
CLOSE_SENT = 3,
};
enum c4iw_ep_history {
ACT_OPEN_REQ = 0,
ACT_OFLD_CONN = 1,
ACT_OPEN_RPL = 2,
ACT_ESTAB = 3,
PASS_ACCEPT_REQ = 4,
PASS_ESTAB = 5,
ABORT_UPCALL = 6,
ESTAB_UPCALL = 7,
CLOSE_UPCALL = 8,
ULP_ACCEPT = 9,
ULP_REJECT = 10,
TIMEDOUT = 11,
PEER_ABORT = 12,
PEER_CLOSE = 13,
CONNREQ_UPCALL = 14,
ABORT_CONN = 15,
DISCONN_UPCALL = 16,
EP_DISC_CLOSE = 17,
EP_DISC_ABORT = 18,
CONN_RPL_UPCALL = 19,
ACT_RETRY_NOMEM = 20,
ACT_RETRY_INUSE = 21
};
struct c4iw_ep_common {
struct iw_cm_id *cm_id;
struct c4iw_qp *qp;
......@@ -723,6 +754,7 @@ struct c4iw_ep_common {
struct sockaddr_in remote_addr;
struct c4iw_wr_wait wr_wait;
unsigned long flags;
unsigned long history;
};
struct c4iw_listen_ep {
......@@ -760,6 +792,7 @@ struct c4iw_ep {
u8 tos;
u8 retry_with_mpa_v1;
u8 tried_with_mpa_v1;
unsigned int retry_count;
};
static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id)
......
......@@ -752,6 +752,9 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
dev->trans_start = jiffies;
++tx->tx_head;
skb_orphan(skb);
skb_dst_drop(skb);
if (++priv->tx_outstanding == ipoib_sendq_size) {
ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
tx->qp->qp_num);
......
......@@ -615,8 +615,9 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
address->last_send = priv->tx_head;
++priv->tx_head;
skb_orphan(skb);
skb_orphan(skb);
skb_dst_drop(skb);
}
if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
......
......@@ -35,6 +35,8 @@
#ifndef __CXGB4_H__
#define __CXGB4_H__
#include "t4_hw.h"
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
......@@ -212,6 +214,8 @@ struct tp_err_stats {
struct tp_params {
unsigned int ntxchan; /* # of Tx channels */
unsigned int tre; /* log2 of core clocks per TP tick */
unsigned short tx_modq_map; /* TX modulation scheduler queue to */
/* channel map */
uint32_t dack_re; /* DACK timer resolution */
unsigned short tx_modq[NCHAN]; /* channel to modulation queue map */
......@@ -526,6 +530,7 @@ struct adapter {
struct net_device *port[MAX_NPORTS];
u8 chan_map[NCHAN]; /* channel -> port map */
u32 filter_mode;
unsigned int l2t_start;
unsigned int l2t_end;
struct l2t_data *l2t;
......@@ -545,6 +550,129 @@ struct adapter {
spinlock_t stats_lock;
};
/* Defined bit width of user definable filter tuples
*/
#define ETHTYPE_BITWIDTH 16
#define FRAG_BITWIDTH 1
#define MACIDX_BITWIDTH 9
#define FCOE_BITWIDTH 1
#define IPORT_BITWIDTH 3
#define MATCHTYPE_BITWIDTH 3
#define PROTO_BITWIDTH 8
#define TOS_BITWIDTH 8
#define PF_BITWIDTH 8
#define VF_BITWIDTH 8
#define IVLAN_BITWIDTH 16
#define OVLAN_BITWIDTH 16
/* Filter matching rules. These consist of a set of ingress packet field
* (value, mask) tuples. The associated ingress packet field matches the
* tuple when ((field & mask) == value). (Thus a wildcard "don't care" field
* rule can be constructed by specifying a tuple of (0, 0).) A filter rule
* matches an ingress packet when all of the individual individual field
* matching rules are true.
*
* Partial field masks are always valid, however, while it may be easy to
* understand their meanings for some fields (e.g. IP address to match a
* subnet), for others making sensible partial masks is less intuitive (e.g.
* MPS match type) ...
*
* Most of the following data structures are modeled on T4 capabilities.
* Drivers for earlier chips use the subsets which make sense for those chips.
* We really need to come up with a hardware-independent mechanism to
* represent hardware filter capabilities ...
*/
struct ch_filter_tuple {
/* Compressed header matching field rules. The TP_VLAN_PRI_MAP
* register selects which of these fields will participate in the
* filter match rules -- up to a maximum of 36 bits. Because
* TP_VLAN_PRI_MAP is a global register, all filters must use the same
* set of fields.
*/
uint32_t ethtype:ETHTYPE_BITWIDTH; /* Ethernet type */
uint32_t frag:FRAG_BITWIDTH; /* IP fragmentation header */
uint32_t ivlan_vld:1; /* inner VLAN valid */
uint32_t ovlan_vld:1; /* outer VLAN valid */
uint32_t pfvf_vld:1; /* PF/VF valid */
uint32_t macidx:MACIDX_BITWIDTH; /* exact match MAC index */
uint32_t fcoe:FCOE_BITWIDTH; /* FCoE packet */
uint32_t iport:IPORT_BITWIDTH; /* ingress port */
uint32_t matchtype:MATCHTYPE_BITWIDTH; /* MPS match type */
uint32_t proto:PROTO_BITWIDTH; /* protocol type */
uint32_t tos:TOS_BITWIDTH; /* TOS/Traffic Type */
uint32_t pf:PF_BITWIDTH; /* PCI-E PF ID */
uint32_t vf:VF_BITWIDTH; /* PCI-E VF ID */
uint32_t ivlan:IVLAN_BITWIDTH; /* inner VLAN */
uint32_t ovlan:OVLAN_BITWIDTH; /* outer VLAN */
/* Uncompressed header matching field rules. These are always
* available for field rules.
*/
uint8_t lip[16]; /* local IP address (IPv4 in [3:0]) */
uint8_t fip[16]; /* foreign IP address (IPv4 in [3:0]) */
uint16_t lport; /* local port */
uint16_t fport; /* foreign port */
};
/* A filter ioctl command.
*/
struct ch_filter_specification {
/* Administrative fields for filter.
*/
uint32_t hitcnts:1; /* count filter hits in TCB */
uint32_t prio:1; /* filter has priority over active/server */
/* Fundamental filter typing. This is the one element of filter
* matching that doesn't exist as a (value, mask) tuple.
*/
uint32_t type:1; /* 0 => IPv4, 1 => IPv6 */
/* Packet dispatch information. Ingress packets which match the
* filter rules will be dropped, passed to the host or switched back
* out as egress packets.
*/
uint32_t action:2; /* drop, pass, switch */
uint32_t rpttid:1; /* report TID in RSS hash field */
uint32_t dirsteer:1; /* 0 => RSS, 1 => steer to iq */
uint32_t iq:10; /* ingress queue */
uint32_t maskhash:1; /* dirsteer=0: store RSS hash in TCB */
uint32_t dirsteerhash:1;/* dirsteer=1: 0 => TCB contains RSS hash */
/* 1 => TCB contains IQ ID */
/* Switch proxy/rewrite fields. An ingress packet which matches a
* filter with "switch" set will be looped back out as an egress
* packet -- potentially with some Ethernet header rewriting.
*/
uint32_t eport:2; /* egress port to switch packet out */
uint32_t newdmac:1; /* rewrite destination MAC address */
uint32_t newsmac:1; /* rewrite source MAC address */
uint32_t newvlan:2; /* rewrite VLAN Tag */
uint8_t dmac[ETH_ALEN]; /* new destination MAC address */
uint8_t smac[ETH_ALEN]; /* new source MAC address */
uint16_t vlan; /* VLAN Tag to insert */
/* Filter rule value/mask pairs.
*/
struct ch_filter_tuple val;
struct ch_filter_tuple mask;
};
enum {
FILTER_PASS = 0, /* default */
FILTER_DROP,
FILTER_SWITCH
};
enum {
VLAN_NOCHANGE = 0, /* default */
VLAN_REMOVE,
VLAN_INSERT,
VLAN_REWRITE
};
static inline u32 t4_read_reg(struct adapter *adap, u32 reg_addr)
{
return readl(adap->regs + reg_addr);
......@@ -701,6 +829,12 @@ static inline int t4_wr_mbox_ns(struct adapter *adap, int mbox, const void *cmd,
void t4_write_indirect(struct adapter *adap, unsigned int addr_reg,
unsigned int data_reg, const u32 *vals,
unsigned int nregs, unsigned int start_idx);
void t4_read_indirect(struct adapter *adap, unsigned int addr_reg,
unsigned int data_reg, u32 *vals, unsigned int nregs,
unsigned int start_idx);
struct fw_filter_wr;
void t4_intr_enable(struct adapter *adapter);
void t4_intr_disable(struct adapter *adapter);
int t4_slow_intr_handler(struct adapter *adapter);
......@@ -737,6 +871,8 @@ void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4,
void t4_load_mtus(struct adapter *adap, const unsigned short *mtus,
const unsigned short *alpha, const unsigned short *beta);
void t4_mk_filtdelwr(unsigned int ftid, struct fw_filter_wr *wr, int qid);
void t4_wol_magic_enable(struct adapter *adap, unsigned int port,
const u8 *addr);
int t4_wol_pat_enable(struct adapter *adap, unsigned int port, unsigned int map,
......
......@@ -175,6 +175,30 @@ enum {
MIN_FL_ENTRIES = 16
};
/* Host shadow copy of ingress filter entry. This is in host native format
* and doesn't match the ordering or bit order, etc. of the hardware of the
* firmware command. The use of bit-field structure elements is purely to
* remind ourselves of the field size limitations and save memory in the case
* where the filter table is large.
*/
struct filter_entry {
/* Administrative fields for filter.
*/
u32 valid:1; /* filter allocated and valid */
u32 locked:1; /* filter is administratively locked */
u32 pending:1; /* filter action is pending firmware reply */
u32 smtidx:8; /* Source MAC Table index for smac */
struct l2t_entry *l2t; /* Layer Two Table entry for dmac */
/* The filter itself. Most of this is a straight copy of information
* provided by the extended ioctl(). Some fields are translated to
* internal forms -- for instance the Ingress Queue ID passed in from
* the ioctl() is translated into the Absolute Ingress Queue ID.
*/
struct ch_filter_specification fs;
};
#define DFLT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | \
NETIF_MSG_TIMER | NETIF_MSG_IFDOWN | NETIF_MSG_IFUP |\
NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR)
......@@ -325,6 +349,9 @@ enum {
static unsigned int tp_vlan_pri_map = TP_VLAN_PRI_MAP_DEFAULT;
module_param(tp_vlan_pri_map, uint, 0644);
MODULE_PARM_DESC(tp_vlan_pri_map, "global compressed filter configuration");
static struct dentry *cxgb4_debugfs_root;
static LIST_HEAD(adapter_list);
......@@ -506,8 +533,67 @@ static int link_start(struct net_device *dev)
return ret;
}
/*
* Response queue handler for the FW event queue.
/* Clear a filter and release any of its resources that we own. This also
* clears the filter's "pending" status.
*/
static void clear_filter(struct adapter *adap, struct filter_entry *f)
{
/* If the new or old filter have loopback rewriteing rules then we'll
* need to free any existing Layer Two Table (L2T) entries of the old
* filter rule. The firmware will handle freeing up any Source MAC
* Table (SMT) entries used for rewriting Source MAC Addresses in
* loopback rules.
*/
if (f->l2t)