ice: replace custom AIM algorithm with kernel's DIM library

The ice driver has support for adaptive interrupt moderation, an algorithm for tuning the interrupt rate dynamically. This algorithm is based on various assumptions about ring size, socket buffer size, link speed, SKB overhead, ethernet frame overhead and more. The Linux kernel has support for a dynamic interrupt moderation algorithm known as "dimlib". Replace the custom driver-specific implementation of dynamic interrupt moderation with the kernel's algorithm. The Intel hardware has a different hardware implementation than the originators of the dimlib code had to work with, which requires the driver to use a slightly different set of inputs for the actual moderation values, while getting all the advice from dimlib of better/worse, shift left or right. The change made for this implementation is to use a pair of values for each of the 5 "slots" that the dimlib moderation expects, and the driver will program those pairs when dimlib recommends a slot to use. The currently implementation uses two tables, one for receive and one for transmit, and the pairs of values in each slot set the maximum delay of an interrupt and a maximum number of interrupts per second (both expressed in microseconds). There are two separate kinds of bugs fixed by using DIMLIB, one is UDP single stream send was too slow, and the other is that 8K ping-pong was going to the most aggressive moderation and has much too high latency. The overall result of using DIMLIB is that we meet or exceed our performance expectations set based on the old algorithm. Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com> Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Tony Brelinski <tonyx.brelinski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
author: Jacob Keller <jacob.e.keller@intel.com> 2021-03-31 14:16:57 -0700
committer: Tony Nguyen <anthony.l.nguyen@intel.com> 2021-04-14 17:00:05 -0700
commit: cdf1f1f169179659621bb540575b3a9d1cd38072 (patch)
tree: 66764ba89f6456c0d937a378a44361c0d29114ea /drivers/net/ethernet/intel/ice/ice_main.c
parent: b8b4772377dd8a916479796c8a8c5425f937fcaf (diff)
download: linux-cdf1f1f169179659621bb540575b3a9d1cd38072.tar.gz
linux-cdf1f1f169179659621bb540575b3a9d1cd38072.tar.bz2
linux-cdf1f1f169179659621bb540575b3a9d1cd38072.zip
1 files changed, 108 insertions, 0 deletions
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 035c593bce61..1e023d163447 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -5196,6 +5196,105 @@ int ice_vsi_cfg(struct ice_vsi *vsi)
 	return err;
 }
 
+/* THEORY OF MODERATION:
+ * The below code creates custom DIM profiles for use by this driver, because
+ * the ice driver hardware works differently than the hardware that DIMLIB was
+ * originally made for. ice hardware doesn't have packet count limits that
+ * can trigger an interrupt, but it *does* have interrupt rate limit support,
+ * and this code adds that capability to be used by the driver when it's using
+ * DIMLIB. The DIMLIB code was always designed to be a suggestion to the driver
+ * for how to "respond" to traffic and interrupts, so this driver uses a
+ * slightly different set of moderation parameters to get best performance.
+ */
+struct ice_dim {
+	/* the throttle rate for interrupts, basically worst case delay before
+	 * an initial interrupt fires, value is stored in microseconds.
+	 */
+	u16 itr;
+	/* the rate limit for interrupts, which can cap a delay from a small
+	 * ITR at a certain amount of interrupts per second. f.e. a 2us ITR
+	 * could yield as much as 500,000 interrupts per second, but with a
+	 * 10us rate limit, it limits to 100,000 interrupts per second. Value
+	 * is stored in microseconds.
+	 */
+	u16 intrl;
+};
+
+/* Make a different profile for Rx that doesn't allow quite so aggressive
+ * moderation at the high end (it maxes out at 128us or about 8k interrupts a
+ * second. The INTRL/rate parameters here are only useful to cap small ITR
+ * values, which is why for larger ITR's - like 128, which can only generate
+ * 8k interrupts per second, there is no point to rate limit and the values
+ * are set to zero. The rate limit values do affect latency, and so must
+ * be reasonably small so to not impact latency sensitive tests.
+ */
+static const struct ice_dim rx_profile[] = {
+	{2, 10},
+	{8, 16},
+	{32, 0},
+	{96, 0},
+	{128, 0}
+};
+
+/* The transmit profile, which has the same sorts of values
+ * as the previous struct
+ */
+static const struct ice_dim tx_profile[] = {
+	{2, 10},
+	{8, 16},
+	{64, 0},
+	{128, 0},
+	{256, 0}
+};
+
+static void ice_tx_dim_work(struct work_struct *work)
+{
+	struct ice_ring_container *rc;
+	struct ice_q_vector *q_vector;
+	struct dim *dim;
+	u16 itr, intrl;
+
+	dim = container_of(work, struct dim, work);
+	rc = container_of(dim, struct ice_ring_container, dim);
+	q_vector = container_of(rc, struct ice_q_vector, tx);
+
+	if (dim->profile_ix >= ARRAY_SIZE(tx_profile))
+		dim->profile_ix = ARRAY_SIZE(tx_profile) - 1;
+
+	/* look up the values in our local table */
+	itr = tx_profile[dim->profile_ix].itr;
+	intrl = tx_profile[dim->profile_ix].intrl;
+
+	ice_write_itr(rc, itr);
+	ice_write_intrl(q_vector, intrl);
+
+	dim->state = DIM_START_MEASURE;
+}
+
+static void ice_rx_dim_work(struct work_struct *work)
+{
+	struct ice_ring_container *rc;
+	struct ice_q_vector *q_vector;
+	struct dim *dim;
+	u16 itr, intrl;
+
+	dim = container_of(work, struct dim, work);
+	rc = container_of(dim, struct ice_ring_container, dim);
+	q_vector = container_of(rc, struct ice_q_vector, rx);
+
+	if (dim->profile_ix >= ARRAY_SIZE(rx_profile))
+		dim->profile_ix = ARRAY_SIZE(rx_profile) - 1;
+
+	/* look up the values in our local table */
+	itr = rx_profile[dim->profile_ix].itr;
+	intrl = rx_profile[dim->profile_ix].intrl;
+
+	ice_write_itr(rc, itr);
+	ice_write_intrl(q_vector, intrl);
+
+	dim->state = DIM_START_MEASURE;
+}
+
 /**
  * ice_napi_enable_all - Enable NAPI for all q_vectors in the VSI
  * @vsi: the VSI being configured
@@ -5210,6 +5309,12 @@ static void ice_napi_enable_all(struct ice_vsi *vsi)
 	ice_for_each_q_vector(vsi, q_idx) {
 		struct ice_q_vector *q_vector = vsi->q_vectors[q_idx];
 
+		INIT_WORK(&q_vector->tx.dim.work, ice_tx_dim_work);
+		q_vector->tx.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+
+		INIT_WORK(&q_vector->rx.dim.work, ice_rx_dim_work);
+		q_vector->rx.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+
 		if (q_vector->rx.ring || q_vector->tx.ring)
 			napi_enable(&q_vector->napi);
 	}
@@ -5618,6 +5723,9 @@ static void ice_napi_disable_all(struct ice_vsi *vsi)
 
 		if (q_vector->rx.ring || q_vector->tx.ring)
 			napi_disable(&q_vector->napi);
+
+		cancel_work_sync(&q_vector->tx.dim.work);
+		cancel_work_sync(&q_vector->rx.dim.work);
 	}
 }
author	Jacob Keller <jacob.e.keller@intel.com>	2021-03-31 14:16:57 -0700
committer	Tony Nguyen <anthony.l.nguyen@intel.com>	2021-04-14 17:00:05 -0700
commit	cdf1f1f169179659621bb540575b3a9d1cd38072 (patch)
tree	66764ba89f6456c0d937a378a44361c0d29114ea /drivers/net/ethernet/intel/ice/ice_main.c
parent	b8b4772377dd8a916479796c8a8c5425f937fcaf (diff)
download	linux-cdf1f1f169179659621bb540575b3a9d1cd38072.tar.gz linux-cdf1f1f169179659621bb540575b3a9d1cd38072.tar.bz2 linux-cdf1f1f169179659621bb540575b3a9d1cd38072.zip