aboutsummaryrefslogtreecommitdiff
path: root/drivers/pci/controller/pci-hyperv.c
diff options
context:
space:
mode:
authorNick Terrell <terrelln@fb.com>2022-12-13 16:21:55 -0800
committerNick Terrell <terrelln@fb.com>2022-12-13 16:21:55 -0800
commit4f2c0a4acffbec01079c28f839422e64ddeff004 (patch)
tree06ada4a8a6d94a94c93944806041b8c994cebfc5 /drivers/pci/controller/pci-hyperv.c
parent88a309465b3f05a100c3b81966982c0f9f5d23a6 (diff)
parent830b3c68c1fb1e9176028d02ef86f3cf76aa2476 (diff)
downloadlinux-4f2c0a4acffbec01079c28f839422e64ddeff004.tar.gz
linux-4f2c0a4acffbec01079c28f839422e64ddeff004.tar.bz2
linux-4f2c0a4acffbec01079c28f839422e64ddeff004.zip
Merge branch 'main' into zstd-linus
Diffstat (limited to 'drivers/pci/controller/pci-hyperv.c')
-rw-r--r--drivers/pci/controller/pci-hyperv.c586
1 files changed, 398 insertions, 188 deletions
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 20ea2ee330b8..f1ec8931dfbc 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -92,6 +92,13 @@ static enum pci_protocol_version_t pci_protocol_versions[] = {
#define SLOT_NAME_SIZE 11
/*
+ * Size of requestor for VMbus; the value is based on the observation
+ * that having more than one request outstanding is 'rare', and so 64
+ * should be generous in ensuring that we don't ever run out.
+ */
+#define HV_PCI_RQSTOR_SIZE 64
+
+/*
* Message Types
*/
@@ -604,17 +611,137 @@ static unsigned int hv_msi_get_int_vector(struct irq_data *data)
return cfg->vector;
}
-static void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
- struct msi_desc *msi_desc)
+static int hv_msi_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *info)
{
- msi_entry->address.as_uint32 = msi_desc->msg.address_lo;
- msi_entry->data.as_uint32 = msi_desc->msg.data;
+ int ret = pci_msi_prepare(domain, dev, nvec, info);
+
+ /*
+ * By using the interrupt remapper in the hypervisor IOMMU, contiguous
+ * CPU vectors is not needed for multi-MSI
+ */
+ if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI)
+ info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+
+ return ret;
}
-static int hv_msi_prepare(struct irq_domain *domain, struct device *dev,
- int nvec, msi_alloc_info_t *info)
+/**
+ * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current
+ * affinity.
+ * @data: Describes the IRQ
+ *
+ * Build new a destination for the MSI and make a hypercall to
+ * update the Interrupt Redirection Table. "Device Logical ID"
+ * is built out of this PCI bus's instance GUID and the function
+ * number of the device.
+ */
+static void hv_arch_irq_unmask(struct irq_data *data)
{
- return pci_msi_prepare(domain, dev, nvec, info);
+ struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
+ struct hv_retarget_device_interrupt *params;
+ struct tran_int_desc *int_desc;
+ struct hv_pcibus_device *hbus;
+ const struct cpumask *dest;
+ cpumask_var_t tmp;
+ struct pci_bus *pbus;
+ struct pci_dev *pdev;
+ unsigned long flags;
+ u32 var_size = 0;
+ int cpu, nr_bank;
+ u64 res;
+
+ dest = irq_data_get_effective_affinity_mask(data);
+ pdev = msi_desc_to_pci_dev(msi_desc);
+ pbus = pdev->bus;
+ hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
+ int_desc = data->chip_data;
+
+ spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
+
+ params = &hbus->retarget_msi_interrupt_params;
+ memset(params, 0, sizeof(*params));
+ params->partition_id = HV_PARTITION_ID_SELF;
+ params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
+ params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff;
+ params->int_entry.msi_entry.data.as_uint32 = int_desc->data;
+ params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
+ (hbus->hdev->dev_instance.b[4] << 16) |
+ (hbus->hdev->dev_instance.b[7] << 8) |
+ (hbus->hdev->dev_instance.b[6] & 0xf8) |
+ PCI_FUNC(pdev->devfn);
+ params->int_target.vector = hv_msi_get_int_vector(data);
+
+ /*
+ * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by
+ * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
+ * spurious interrupt storm. Not doing so does not seem to have a
+ * negative effect (yet?).
+ */
+
+ if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
+ /*
+ * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
+ * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
+ * with >64 VP support.
+ * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
+ * is not sufficient for this hypercall.
+ */
+ params->int_target.flags |=
+ HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
+
+ if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
+ res = 1;
+ goto exit_unlock;
+ }
+
+ cpumask_and(tmp, dest, cpu_online_mask);
+ nr_bank = cpumask_to_vpset(&params->int_target.vp_set, tmp);
+ free_cpumask_var(tmp);
+
+ if (nr_bank <= 0) {
+ res = 1;
+ goto exit_unlock;
+ }
+
+ /*
+ * var-sized hypercall, var-size starts after vp_mask (thus
+ * vp_set.format does not count, but vp_set.valid_bank_mask
+ * does).
+ */
+ var_size = 1 + nr_bank;
+ } else {
+ for_each_cpu_and(cpu, dest, cpu_online_mask) {
+ params->int_target.vp_mask |=
+ (1ULL << hv_cpu_number_to_vp_number(cpu));
+ }
+ }
+
+ res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
+ params, NULL);
+
+exit_unlock:
+ spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
+
+ /*
+ * During hibernation, when a CPU is offlined, the kernel tries
+ * to move the interrupt to the remaining CPUs that haven't
+ * been offlined yet. In this case, the below hv_do_hypercall()
+ * always fails since the vmbus channel has been closed:
+ * refer to cpu_disable_common() -> fixup_irqs() ->
+ * irq_migrate_all_off_this_cpu() -> migrate_one_irq().
+ *
+ * Suppress the error message for hibernation because the failure
+ * during hibernation does not matter (at this time all the devices
+ * have been frozen). Note: the correct affinity info is still updated
+ * into the irqdata data structure in migrate_one_irq() ->
+ * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM
+ * resumes, hv_pci_restore_msi_state() is able to correctly restore
+ * the interrupt with the correct affinity.
+ */
+ if (!hv_result_success(res) && hbus->state != hv_pcibus_removing)
+ dev_err(&hbus->hdev->device,
+ "%s() failed: %#llx", __func__, res);
}
#elif defined(CONFIG_ARM64)
/*
@@ -651,14 +778,6 @@ static unsigned int hv_msi_get_int_vector(struct irq_data *irqd)
return irqd->parent_data->hwirq;
}
-static void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
- struct msi_desc *msi_desc)
-{
- msi_entry->address = ((u64)msi_desc->msg.address_hi << 32) |
- msi_desc->msg.address_lo;
- msi_entry->data = msi_desc->msg.data;
-}
-
/*
* @nr_bm_irqs: Indicates the number of IRQs that were allocated from
* the bitmap.
@@ -839,6 +958,12 @@ static struct irq_domain *hv_pci_get_root_domain(void)
{
return hv_msi_gic_irq_domain;
}
+
+/*
+ * SPIs are used for interrupts of PCI devices and SPIs is managed via GICD
+ * registers which Hyper-V already supports, so no hypercall needed.
+ */
+static void hv_arch_irq_unmask(struct irq_data *data) { }
#endif /* CONFIG_ARM64 */
/**
@@ -856,11 +981,7 @@ static void hv_pci_generic_compl(void *context, struct pci_response *resp,
{
struct hv_pci_compl *comp_pkt = context;
- if (resp_packet_size >= offsetofend(struct pci_response, status))
- comp_pkt->completion_status = resp->status;
- else
- comp_pkt->completion_status = -1;
-
+ comp_pkt->completion_status = resp->status;
complete(&comp_pkt->host_event);
}
@@ -1400,6 +1521,10 @@ static void hv_int_desc_free(struct hv_pci_dev *hpdev,
u8 buffer[sizeof(struct pci_delete_interrupt)];
} ctxt;
+ if (!int_desc->vector_count) {
+ kfree(int_desc);
+ return;
+ }
memset(&ctxt, 0, sizeof(ctxt));
int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
int_pkt->message_type.type =
@@ -1407,7 +1532,7 @@ static void hv_int_desc_free(struct hv_pci_dev *hpdev,
int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
int_pkt->int_desc = *int_desc;
vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
- (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0);
+ 0, VM_PKT_DATA_INBAND, 0);
kfree(int_desc);
}
@@ -1456,119 +1581,9 @@ static void hv_irq_mask(struct irq_data *data)
irq_chip_mask_parent(data);
}
-/**
- * hv_irq_unmask() - "Unmask" the IRQ by setting its current
- * affinity.
- * @data: Describes the IRQ
- *
- * Build new a destination for the MSI and make a hypercall to
- * update the Interrupt Redirection Table. "Device Logical ID"
- * is built out of this PCI bus's instance GUID and the function
- * number of the device.
- */
static void hv_irq_unmask(struct irq_data *data)
{
- struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
- struct hv_retarget_device_interrupt *params;
- struct hv_pcibus_device *hbus;
- struct cpumask *dest;
- cpumask_var_t tmp;
- struct pci_bus *pbus;
- struct pci_dev *pdev;
- unsigned long flags;
- u32 var_size = 0;
- int cpu, nr_bank;
- u64 res;
-
- dest = irq_data_get_effective_affinity_mask(data);
- pdev = msi_desc_to_pci_dev(msi_desc);
- pbus = pdev->bus;
- hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
-
- spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
-
- params = &hbus->retarget_msi_interrupt_params;
- memset(params, 0, sizeof(*params));
- params->partition_id = HV_PARTITION_ID_SELF;
- params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
- hv_set_msi_entry_from_desc(&params->int_entry.msi_entry, msi_desc);
- params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
- (hbus->hdev->dev_instance.b[4] << 16) |
- (hbus->hdev->dev_instance.b[7] << 8) |
- (hbus->hdev->dev_instance.b[6] & 0xf8) |
- PCI_FUNC(pdev->devfn);
- params->int_target.vector = hv_msi_get_int_vector(data);
-
- /*
- * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by
- * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
- * spurious interrupt storm. Not doing so does not seem to have a
- * negative effect (yet?).
- */
-
- if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
- /*
- * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
- * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
- * with >64 VP support.
- * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
- * is not sufficient for this hypercall.
- */
- params->int_target.flags |=
- HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
-
- if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
- res = 1;
- goto exit_unlock;
- }
-
- cpumask_and(tmp, dest, cpu_online_mask);
- nr_bank = cpumask_to_vpset(&params->int_target.vp_set, tmp);
- free_cpumask_var(tmp);
-
- if (nr_bank <= 0) {
- res = 1;
- goto exit_unlock;
- }
-
- /*
- * var-sized hypercall, var-size starts after vp_mask (thus
- * vp_set.format does not count, but vp_set.valid_bank_mask
- * does).
- */
- var_size = 1 + nr_bank;
- } else {
- for_each_cpu_and(cpu, dest, cpu_online_mask) {
- params->int_target.vp_mask |=
- (1ULL << hv_cpu_number_to_vp_number(cpu));
- }
- }
-
- res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
- params, NULL);
-
-exit_unlock:
- spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
-
- /*
- * During hibernation, when a CPU is offlined, the kernel tries
- * to move the interrupt to the remaining CPUs that haven't
- * been offlined yet. In this case, the below hv_do_hypercall()
- * always fails since the vmbus channel has been closed:
- * refer to cpu_disable_common() -> fixup_irqs() ->
- * irq_migrate_all_off_this_cpu() -> migrate_one_irq().
- *
- * Suppress the error message for hibernation because the failure
- * during hibernation does not matter (at this time all the devices
- * have been frozen). Note: the correct affinity info is still updated
- * into the irqdata data structure in migrate_one_irq() ->
- * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM
- * resumes, hv_pci_restore_msi_state() is able to correctly restore
- * the interrupt with the correct affinity.
- */
- if (!hv_result_success(res) && hbus->state != hv_pcibus_removing)
- dev_err(&hbus->hdev->device,
- "%s() failed: %#llx", __func__, res);
+ hv_arch_irq_unmask(data);
if (data->parent_data->chip->irq_unmask)
irq_chip_unmask_parent(data);
@@ -1587,19 +1602,24 @@ static void hv_pci_compose_compl(void *context, struct pci_response *resp,
struct pci_create_int_response *int_resp =
(struct pci_create_int_response *)resp;
+ if (resp_packet_size < sizeof(*int_resp)) {
+ comp_pkt->comp_pkt.completion_status = -1;
+ goto out;
+ }
comp_pkt->comp_pkt.completion_status = resp->status;
comp_pkt->int_desc = int_resp->int_desc;
+out:
complete(&comp_pkt->comp_pkt.host_event);
}
static u32 hv_compose_msi_req_v1(
- struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
- u32 slot, u8 vector)
+ struct pci_create_interrupt *int_pkt,
+ u32 slot, u8 vector, u16 vector_count)
{
int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
int_pkt->wslot.slot = slot;
int_pkt->int_desc.vector = vector;
- int_pkt->int_desc.vector_count = 1;
+ int_pkt->int_desc.vector_count = vector_count;
int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
/*
@@ -1612,26 +1632,76 @@ static u32 hv_compose_msi_req_v1(
}
/*
+ * The vCPU selected by hv_compose_multi_msi_req_get_cpu() and
+ * hv_compose_msi_req_get_cpu() is a "dummy" vCPU because the final vCPU to be
+ * interrupted is specified later in hv_irq_unmask() and communicated to Hyper-V
+ * via the HVCALL_RETARGET_INTERRUPT hypercall. But the choice of dummy vCPU is
+ * not irrelevant because Hyper-V chooses the physical CPU to handle the
+ * interrupts based on the vCPU specified in message sent to the vPCI VSP in
+ * hv_compose_msi_msg(). Hyper-V's choice of pCPU is not visible to the guest,
+ * but assigning too many vPCI device interrupts to the same pCPU can cause a
+ * performance bottleneck. So we spread out the dummy vCPUs to influence Hyper-V
+ * to spread out the pCPUs that it selects.
+ *
+ * For the single-MSI and MSI-X cases, it's OK for hv_compose_msi_req_get_cpu()
+ * to always return the same dummy vCPU, because a second call to
+ * hv_compose_msi_msg() contains the "real" vCPU, causing Hyper-V to choose a
+ * new pCPU for the interrupt. But for the multi-MSI case, the second call to
+ * hv_compose_msi_msg() exits without sending a message to the vPCI VSP, so the
+ * original dummy vCPU is used. This dummy vCPU must be round-robin'ed so that
+ * the pCPUs are spread out. All interrupts for a multi-MSI device end up using
+ * the same pCPU, even though the vCPUs will be spread out by later calls
+ * to hv_irq_unmask(), but that is the best we can do now.
+ *
+ * With Hyper-V in Nov 2022, the HVCALL_RETARGET_INTERRUPT hypercall does *not*
+ * cause Hyper-V to reselect the pCPU based on the specified vCPU. Such an
+ * enhancement is planned for a future version. With that enhancement, the
+ * dummy vCPU selection won't matter, and interrupts for the same multi-MSI
+ * device will be spread across multiple pCPUs.
+ */
+
+/*
* Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
* by subsequent retarget in hv_irq_unmask().
*/
-static int hv_compose_msi_req_get_cpu(struct cpumask *affinity)
+static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity)
{
return cpumask_first_and(affinity, cpu_online_mask);
}
-static u32 hv_compose_msi_req_v2(
- struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
- u32 slot, u8 vector)
+/*
+ * Make sure the dummy vCPU values for multi-MSI don't all point to vCPU0.
+ */
+static int hv_compose_multi_msi_req_get_cpu(void)
{
+ static DEFINE_SPINLOCK(multi_msi_cpu_lock);
+
+ /* -1 means starting with CPU 0 */
+ static int cpu_next = -1;
+
+ unsigned long flags;
int cpu;
+ spin_lock_irqsave(&multi_msi_cpu_lock, flags);
+
+ cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask, nr_cpu_ids,
+ false);
+ cpu = cpu_next;
+
+ spin_unlock_irqrestore(&multi_msi_cpu_lock, flags);
+
+ return cpu;
+}
+
+static u32 hv_compose_msi_req_v2(
+ struct pci_create_interrupt2 *int_pkt, int cpu,
+ u32 slot, u8 vector, u16 vector_count)
+{
int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
int_pkt->wslot.slot = slot;
int_pkt->int_desc.vector = vector;
- int_pkt->int_desc.vector_count = 1;
+ int_pkt->int_desc.vector_count = vector_count;
int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
- cpu = hv_compose_msi_req_get_cpu(affinity);
int_pkt->int_desc.processor_array[0] =
hv_cpu_number_to_vp_number(cpu);
int_pkt->int_desc.processor_count = 1;
@@ -1640,18 +1710,15 @@ static u32 hv_compose_msi_req_v2(
}
static u32 hv_compose_msi_req_v3(
- struct pci_create_interrupt3 *int_pkt, struct cpumask *affinity,
- u32 slot, u32 vector)
+ struct pci_create_interrupt3 *int_pkt, int cpu,
+ u32 slot, u32 vector, u16 vector_count)
{
- int cpu;
-
int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3;
int_pkt->wslot.slot = slot;
int_pkt->int_desc.vector = vector;
int_pkt->int_desc.reserved = 0;
- int_pkt->int_desc.vector_count = 1;
+ int_pkt->int_desc.vector_count = vector_count;
int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
- cpu = hv_compose_msi_req_get_cpu(affinity);
int_pkt->int_desc.processor_array[0] =
hv_cpu_number_to_vp_number(cpu);
int_pkt->int_desc.processor_count = 1;
@@ -1677,9 +1744,16 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
struct hv_pci_dev *hpdev;
struct pci_bus *pbus;
struct pci_dev *pdev;
- struct cpumask *dest;
+ const struct cpumask *dest;
struct compose_comp_ctxt comp;
struct tran_int_desc *int_desc;
+ struct msi_desc *msi_desc;
+ /*
+ * vector_count should be u16: see hv_msi_desc, hv_msi_desc2
+ * and hv_msi_desc3. vector must be u32: see hv_msi_desc3.
+ */
+ u16 vector_count;
+ u32 vector;
struct {
struct pci_packet pci_pkt;
union {
@@ -1688,11 +1762,26 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
struct pci_create_interrupt3 v3;
} int_pkts;
} __packed ctxt;
-
+ bool multi_msi;
+ u64 trans_id;
u32 size;
int ret;
+ int cpu;
+
+ msi_desc = irq_data_get_msi_desc(data);
+ multi_msi = !msi_desc->pci.msi_attrib.is_msix &&
+ msi_desc->nvec_used > 1;
+
+ /* Reuse the previous allocation */
+ if (data->chip_data && multi_msi) {
+ int_desc = data->chip_data;
+ msg->address_hi = int_desc->address >> 32;
+ msg->address_lo = int_desc->address & 0xffffffff;
+ msg->data = int_desc->data;
+ return;
+ }
- pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
+ pdev = msi_desc_to_pci_dev(msi_desc);
dest = irq_data_get_effective_affinity_mask(data);
pbus = pdev->bus;
hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
@@ -1702,7 +1791,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
goto return_null_message;
/* Free any previous message that might have already been composed. */
- if (data->chip_data) {
+ if (data->chip_data && !multi_msi) {
int_desc = data->chip_data;
data->chip_data = NULL;
hv_int_desc_free(hpdev, int_desc);
@@ -1712,6 +1801,43 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
if (!int_desc)
goto drop_reference;
+ if (multi_msi) {
+ /*
+ * If this is not the first MSI of Multi MSI, we already have
+ * a mapping. Can exit early.
+ */
+ if (msi_desc->irq != data->irq) {
+ data->chip_data = int_desc;
+ int_desc->address = msi_desc->msg.address_lo |
+ (u64)msi_desc->msg.address_hi << 32;
+ int_desc->data = msi_desc->msg.data +
+ (data->irq - msi_desc->irq);
+ msg->address_hi = msi_desc->msg.address_hi;
+ msg->address_lo = msi_desc->msg.address_lo;
+ msg->data = int_desc->data;
+ put_pcichild(hpdev);
+ return;
+ }
+ /*
+ * The vector we select here is a dummy value. The correct
+ * value gets sent to the hypervisor in unmask(). This needs
+ * to be aligned with the count, and also not zero. Multi-msi
+ * is powers of 2 up to 32, so 32 will always work here.
+ */
+ vector = 32;
+ vector_count = msi_desc->nvec_used;
+ cpu = hv_compose_multi_msi_req_get_cpu();
+ } else {
+ vector = hv_msi_get_int_vector(data);
+ vector_count = 1;
+ cpu = hv_compose_msi_req_get_cpu(dest);
+ }
+
+ /*
+ * hv_compose_msi_req_v1 and v2 are for x86 only, meaning 'vector'
+ * can't exceed u8. Cast 'vector' down to u8 for v1/v2 explicitly
+ * for better readability.
+ */
memset(&ctxt, 0, sizeof(ctxt));
init_completion(&comp.comp_pkt.host_event);
ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
@@ -1720,24 +1846,26 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
switch (hbus->protocol_version) {
case PCI_PROTOCOL_VERSION_1_1:
size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
- dest,
hpdev->desc.win_slot.slot,
- hv_msi_get_int_vector(data));
+ (u8)vector,
+ vector_count);
break;
case PCI_PROTOCOL_VERSION_1_2:
case PCI_PROTOCOL_VERSION_1_3:
size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
- dest,
+ cpu,
hpdev->desc.win_slot.slot,
- hv_msi_get_int_vector(data));
+ (u8)vector,
+ vector_count);
break;
case PCI_PROTOCOL_VERSION_1_4:
size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
- dest,
+ cpu,
hpdev->desc.win_slot.slot,
- hv_msi_get_int_vector(data));
+ vector,
+ vector_count);
break;
default:
@@ -1750,10 +1878,10 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
goto free_int_desc;
}
- ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
- size, (unsigned long)&ctxt.pci_pkt,
- VM_PKT_DATA_INBAND,
- VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+ ret = vmbus_sendpacket_getid(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
+ size, (unsigned long)&ctxt.pci_pkt,
+ &trans_id, VM_PKT_DATA_INBAND,
+ VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
if (ret) {
dev_err(&hbus->hdev->device,
"Sending request for interrupt failed: 0x%x",
@@ -1832,6 +1960,15 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
enable_tasklet:
tasklet_enable(&channel->callback_event);
+ /*
+ * The completion packet on the stack becomes invalid after 'return';
+ * remove the ID from the VMbus requestor if the identifier is still
+ * mapped to/associated with the packet. (The identifier could have
+ * been 're-used', i.e., already removed and (re-)mapped.)
+ *
+ * Cf. hv_pci_onchannelcallback().
+ */
+ vmbus_request_addr_match(channel, trans_id, (unsigned long)&ctxt.pci_pkt);
free_int_desc:
kfree(int_desc);
drop_reference:
@@ -2079,12 +2216,17 @@ static void prepopulate_bars(struct hv_pcibus_device *hbus)
}
}
if (high_size <= 1 && low_size <= 1) {
- /* Set the memory enable bit. */
- _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2,
- &command);
- command |= PCI_COMMAND_MEMORY;
- _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2,
- command);
+ /*
+ * No need to set the PCI_COMMAND_MEMORY bit as
+ * the core PCI driver doesn't require the bit
+ * to be pre-set. Actually here we intentionally
+ * keep the bit off so that the PCI BAR probing
+ * in the core PCI driver doesn't cause Hyper-V
+ * to unnecessarily unmap/map the virtual BARs
+ * from/to the physical BARs multiple times.
+ * This reduces the VM boot time significantly
+ * if the BAR sizes are huge.
+ */
break;
}
}
@@ -2155,8 +2297,17 @@ static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus)
if (!hv_dev)
continue;
- if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY)
- set_dev_node(&dev->dev, hv_dev->desc.virtual_numa_node);
+ if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY &&
+ hv_dev->desc.virtual_numa_node < num_possible_nodes())
+ /*
+ * The kernel may boot with some NUMA nodes offline
+ * (e.g. in a KDUMP kernel) or with NUMA disabled via
+ * "numa=off". In those cases, adjust the host provided
+ * NUMA node to a valid NUMA node used by the kernel.
+ */
+ set_dev_node(&dev->dev,
+ numa_map_to_online_node(
+ hv_dev->desc.virtual_numa_node));
put_pcichild(hv_dev);
}
@@ -2211,12 +2362,14 @@ static void q_resource_requirements(void *context, struct pci_response *resp,
struct q_res_req_compl *completion = context;
struct pci_q_res_req_response *q_res_req =
(struct pci_q_res_req_response *)resp;
+ s32 status;
int i;
- if (resp->status < 0) {
+ status = (resp_packet_size < sizeof(*q_res_req)) ? -1 : resp->status;
+ if (status < 0) {
dev_err(&completion->hpdev->hbus->hdev->device,
"query resource requirements failed: %x\n",
- resp->status);
+ status);
} else {
for (i = 0; i < PCI_STD_NUM_BARS; i++) {
completion->hpdev->probed_bar[i] =
@@ -2640,7 +2793,7 @@ static void hv_eject_device_work(struct work_struct *work)
ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
vmbus_sendpacket(hbus->hdev->channel, ejct_pkt,
- sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
+ sizeof(*ejct_pkt), 0,
VM_PKT_DATA_INBAND, 0);
/* For the get_pcichild() in hv_pci_eject_device() */
@@ -2687,8 +2840,9 @@ static void hv_pci_onchannelcallback(void *context)
const int packet_size = 0x100;
int ret;
struct hv_pcibus_device *hbus = context;
+ struct vmbus_channel *chan = hbus->hdev->channel;
u32 bytes_recvd;
- u64 req_id;
+ u64 req_id, req_addr;
struct vmpacket_descriptor *desc;
unsigned char *buffer;
int bufferlen = packet_size;
@@ -2700,14 +2854,15 @@ static void hv_pci_onchannelcallback(void *context)
struct pci_dev_inval_block *inval;
struct pci_dev_incoming *dev_message;
struct hv_pci_dev *hpdev;
+ unsigned long flags;
buffer = kmalloc(bufferlen, GFP_ATOMIC);
if (!buffer)
return;
while (1) {
- ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer,
- bufferlen, &bytes_recvd, &req_id);
+ ret = vmbus_recvpacket_raw(chan, buffer, bufferlen,
+ &bytes_recvd, &req_id);
if (ret == -ENOBUFS) {
kfree(buffer);
@@ -2734,15 +2889,29 @@ static void hv_pci_onchannelcallback(void *context)
switch (desc->type) {
case VM_PKT_COMP:
+ lock_requestor(chan, flags);
+ req_addr = __vmbus_request_addr_match(chan, req_id,
+ VMBUS_RQST_ADDR_ANY);
+ if (req_addr == VMBUS_RQST_ERROR) {
+ unlock_requestor(chan, flags);
+ dev_err(&hbus->hdev->device,
+ "Invalid transaction ID %llx\n",
+ req_id);
+ break;
+ }
+ comp_packet = (struct pci_packet *)req_addr;
+ response = (struct pci_response *)buffer;
/*
- * The host is trusted, and thus it's safe to interpret
- * this transaction ID as a pointer.
+ * Call ->completion_func() within the critical section to make
+ * sure that the packet pointer is still valid during the call:
+ * here 'valid' means that there's a task still waiting for the
+ * completion, and that the packet data is still on the waiting
+ * task's stack. Cf. hv_compose_msi_msg().
*/
- comp_packet = (struct pci_packet *)req_id;
- response = (struct pci_response *)buffer;
comp_packet->completion_func(comp_packet->compl_ctxt,
response,
bytes_recvd);
+ unlock_requestor(chan, flags);
break;
case VM_PKT_DATA_INBAND:
@@ -2752,7 +2921,8 @@ static void hv_pci_onchannelcallback(void *context)
case PCI_BUS_RELATIONS:
bus_rel = (struct pci_bus_relations *)buffer;
- if (bytes_recvd <
+ if (bytes_recvd < sizeof(*bus_rel) ||
+ bytes_recvd <
struct_size(bus_rel, func,
bus_rel->device_count)) {
dev_err(&hbus->hdev->device,
@@ -2766,7 +2936,8 @@ static void hv_pci_onchannelcallback(void *context)
case PCI_BUS_RELATIONS2:
bus_rel2 = (struct pci_bus_relations2 *)buffer;
- if (bytes_recvd <
+ if (bytes_recvd < sizeof(*bus_rel2) ||
+ bytes_recvd <
struct_size(bus_rel2, func,
bus_rel2->device_count)) {
dev_err(&hbus->hdev->device,
@@ -2780,6 +2951,11 @@ static void hv_pci_onchannelcallback(void *context)
case PCI_EJECT:
dev_message = (struct pci_dev_incoming *)buffer;
+ if (bytes_recvd < sizeof(*dev_message)) {
+ dev_err(&hbus->hdev->device,
+ "eject message too small\n");
+ break;
+ }
hpdev = get_pcichild_wslot(hbus,
dev_message->wslot.slot);
if (hpdev) {
@@ -2791,6 +2967,11 @@ static void hv_pci_onchannelcallback(void *context)
case PCI_INVALIDATE_BLOCK:
inval = (struct pci_dev_inval_block *)buffer;
+ if (bytes_recvd < sizeof(*inval)) {
+ dev_err(&hbus->hdev->device,
+ "invalidate message too small\n");
+ break;
+ }
hpdev = get_pcichild_wslot(hbus,
inval->wslot.slot);
if (hpdev) {
@@ -3395,6 +3576,15 @@ static int hv_pci_probe(struct hv_device *hdev,
hbus->bridge->domain_nr = dom;
#ifdef CONFIG_X86
hbus->sysdata.domain = dom;
+#elif defined(CONFIG_ARM64)
+ /*
+ * Set the PCI bus parent to be the corresponding VMbus
+ * device. Then the VMbus device will be assigned as the
+ * ACPI companion in pcibios_root_bridge_prepare() and
+ * pci_dma_configure() will propagate device coherence
+ * information to devices created on the bus.
+ */
+ hbus->sysdata.parent = hdev->device.parent;
#endif
hbus->hdev = hdev;
@@ -3410,6 +3600,10 @@ static int hv_pci_probe(struct hv_device *hdev,
goto free_dom;
}
+ hdev->channel->next_request_id_callback = vmbus_next_request_id;
+ hdev->channel->request_addr_callback = vmbus_request_addr;
+ hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE;
+
ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
hv_pci_onchannelcallback, hbus);
if (ret)
@@ -3540,6 +3734,7 @@ free_bus:
static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs)
{
struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
+ struct vmbus_channel *chan = hdev->channel;
struct {
struct pci_packet teardown_packet;
u8 buffer[sizeof(struct pci_message)];
@@ -3547,13 +3742,14 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs)
struct hv_pci_compl comp_pkt;
struct hv_pci_dev *hpdev, *tmp;
unsigned long flags;
+ u64 trans_id;
int ret;
/*
* After the host sends the RESCIND_CHANNEL message, it doesn't
* access the per-channel ringbuffer any longer.
*/
- if (hdev->channel->rescind)
+ if (chan->rescind)
return 0;
if (!keep_devs) {
@@ -3590,16 +3786,26 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs)
pkt.teardown_packet.compl_ctxt = &comp_pkt;
pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;
- ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message,
- sizeof(struct pci_message),
- (unsigned long)&pkt.teardown_packet,
- VM_PKT_DATA_INBAND,
- VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+ ret = vmbus_sendpacket_getid(chan, &pkt.teardown_packet.message,
+ sizeof(struct pci_message),
+ (unsigned long)&pkt.teardown_packet,
+ &trans_id, VM_PKT_DATA_INBAND,
+ VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
if (ret)
return ret;
- if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0)
+ if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) {
+ /*
+ * The completion packet on the stack becomes invalid after
+ * 'return'; remove the ID from the VMbus requestor if the
+ * identifier is still mapped to/associated with the packet.
+ *
+ * Cf. hv_pci_onchannelcallback().
+ */
+ vmbus_request_addr_match(chan, trans_id,
+ (unsigned long)&pkt.teardown_packet);
return -ETIMEDOUT;
+ }
return 0;
}
@@ -3740,6 +3946,10 @@ static int hv_pci_resume(struct hv_device *hdev)
hbus->state = hv_pcibus_init;
+ hdev->channel->next_request_id_callback = vmbus_next_request_id;
+ hdev->channel->request_addr_callback = vmbus_request_addr;
+ hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE;
+
ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
hv_pci_onchannelcallback, hbus);
if (ret)