sriov功能的简单理解

B站上有两个视频讲得挺好的:从普通模型的收包处理流程(两次中断和一次复制),到vmdq(一次中断和一次复制),再到sriov(一次中断),讲述了模型的一步步优化。
sriov简单理解

其它参考文章:
1:基于sriov寄存器原理讲解的
2:https://projectacrn.github.io/latest/tutorials/sriov_virtualization.html#sr
3:https://access.redhat.com/documentation/zh-cn/red_hat_enterprise_linux_openstack_platform/7/html/networking_guide/sec-sr-iov

sriov功能涉及的api主要如下所示

/*
 * SR-IOV interface exposed to PCI drivers (excerpt).  The prototypes below
 * apply when CONFIG_PCI_IOV is set; the #else branch supplies inline stubs.
 * Only the first stub is quoted here and the excerpt stops before the
 * matching #endif.
 */
#ifdef CONFIG_PCI_IOV
/* Bus number and devfn of VF 'id' belonging to the PF 'dev'. */
int pci_iov_virtfn_bus(struct pci_dev *dev, int id);
int pci_iov_virtfn_devfn(struct pci_dev *dev, int id);
 
/* Enabling/disabling VFs, adding/removing a single VF, and VF counters. */
int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
void pci_disable_sriov(struct pci_dev *dev);
int pci_iov_add_virtfn(struct pci_dev *dev, int id);
void pci_iov_remove_virtfn(struct pci_dev *dev, int id);
int pci_num_vf(struct pci_dev *dev);
int pci_vfs_assigned(struct pci_dev *dev);
int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
int pci_sriov_get_totalvfs(struct pci_dev *dev);
int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn);
resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
 
/* Arch may override these (weak) */
int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs);
int pcibios_sriov_disable(struct pci_dev *pdev);
resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno);
#else
/* Stub used when SR-IOV support is not compiled in. */
static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id)
{
    return -ENOSYS;
}

参考:sriov主要api

sriov api理解分析

#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/delay.h>
#include "pci.h"
#define VIRTFN_ID_LEN  16
/*
 * VF Routing ID layout (8-bit bus number, 5-bit device number, 3-bit
 * function number):
 *   First VF Offset: Routing ID offset of the first VF relative to the PF
 *   VF Stride:       Routing ID distance between two consecutive VFs
 *   Routing ID of VF N (N >= 1) =
 *       PF Routing ID + First VF Offset + (N - 1) * VF Stride
 * With the zero-based vf_id used by the helpers below:
 *   vf_bus   = pf_bus + ((pf_devfn + offset + stride * vf_id) >> 8)
 *   vf_devfn = (pf_devfn + offset + stride * vf_id) & 0xff
 */
/* Bus number of VF vf_id of PF dev; -EINVAL if dev is not a PF. */
int pci_iov_virtfn_bus(struct pci_dev *dev, int vf_id)
{
    unsigned int rid;

    if (!dev->is_physfn)
        return -EINVAL;

    /* Anything that carries out of the low 8 bits (devfn) moves to the next bus. */
    rid = dev->devfn + dev->sriov->offset + dev->sriov->stride * vf_id;
    return dev->bus->number + (rid >> 8);
}
/* devfn (low 8 bits of the Routing ID) of VF vf_id of PF dev; -EINVAL if dev is not a PF. */
int pci_iov_virtfn_devfn(struct pci_dev *dev, int vf_id)
{
    unsigned int rid;

    if (!dev->is_physfn)
        return -EINVAL;

    rid = dev->devfn + dev->sriov->offset + dev->sriov->stride * vf_id;
    return rid & 0xff;
}
/*
 * Write nr_virtfn to the NumVFs register, then re-read First VF Offset and
 * VF Stride into the cached pci_sriov fields (both are refreshed after every
 * NumVFs write).
 */
static inline void pci_iov_set_numvfs(struct pci_dev *dev, int nr_virtfn)
{
    struct pci_sriov *sriov = dev->sriov;
    int cap = sriov->pos;

    pci_write_config_word(dev, cap + PCI_SRIOV_NUM_VF, nr_virtfn);
    pci_read_config_word(dev, cap + PCI_SRIOV_VF_OFFSET, &sriov->offset);
    pci_read_config_word(dev, cap + PCI_SRIOV_VF_STRIDE, &sriov->stride);
}
/*
 * Try every NumVFs value from total_VFs down to 1, and record in
 * iov->max_VF_buses the highest bus number any VF would land on.
 * NumVFs is reset to 0 before returning.
 *
 * Returns 0 on success, or -EIO if the device reports a zero offset (or a
 * zero stride while more than one VF is configured).
 */
static int compute_max_vf_buses(struct pci_dev *dev)
{
    struct pci_sriov *sriov = dev->sriov;
    int count = sriov->total_VFs;
    int status = 0;

    while (count) {
        int vf_bus;

        /* Offset and stride are re-read for each candidate NumVFs value. */
        pci_iov_set_numvfs(dev, count);
        if (!sriov->offset || (count > 1 && !sriov->stride)) {
            status = -EIO;
            break;
        }

        /* Bus of the last VF for this NumVFs setting. */
        vf_bus = pci_iov_virtfn_bus(dev, count - 1);
        if (vf_bus > sriov->max_VF_buses)
            sriov->max_VF_buses = vf_bus;

        count--;
    }

    pci_iov_set_numvfs(dev, 0);
    return status;
}

/********************************************************************************************/
/*
 * Per-VF size of the IOV resource resno (an index into dev->resource[],
 * starting at PCI_IOV_RESOURCES).  Returns 0 when dev is not a PF.
 */
resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
{
    int vf_bar = resno - PCI_IOV_RESOURCES;

    return dev->is_physfn ? dev->sriov->barsz[vf_bar] : 0;
}
/*
 * Default (weak) alignment for IOV resource resno: the per-VF size reported
 * by pci_iov_resource_size().  Declared __weak so an architecture can supply
 * its own version.
 */
resource_size_t __weak pcibios_iov_resource_alignment(struct pci_dev *dev,int resno)
{
    return pci_iov_resource_size(dev, resno);
}
/*
 * Alignment of IOV resource resno of dev; simply forwards to
 * pcibios_iov_resource_alignment() so an arch override takes effect.
 */
resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno)
{
    return pcibios_iov_resource_alignment(dev, resno);
}
/*****************************************************************************************/
/* sysfs "sriov_totalvfs": prints the value of pci_sriov_get_totalvfs(). */
static ssize_t sriov_totalvfs_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
    return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(to_pci_dev(dev)));
}
/* sysfs "sriov_numvfs": prints the number of currently enabled VFs. */
static ssize_t sriov_numvfs_show(struct device *dev,
                                 struct device_attribute *attr, char *buf)
{
    struct pci_dev *pf = to_pci_dev(dev);
    u16 enabled;

    /* Take the device lock so we do not race with sriov_numvfs_store(). */
    device_lock(&pf->dev);
    enabled = pf->sriov->num_VFs;
    device_unlock(&pf->dev);

    return sprintf(buf, "%u\n", enabled);
}
static ssize_t sriov_offset_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
     struct pci_dev *pdev = to_pci_dev(dev);
     return sprintf(buf, "%u\n", pdev->sriov->offset);
}
static ssize_t sriov_stride_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
     struct pci_dev *pdev = to_pci_dev(dev);
     return sprintf(buf, "%u\n", pdev->sriov->stride);
}
static ssize_t sriov_vf_device_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
    struct pci_dev *pdev = to_pci_dev(dev);
    return sprintf(buf, "%x\n", pdev->sriov->vf_device);
}
static ssize_t sriov_drivers_autoprobe_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
    struct pci_dev *pdev = to_pci_dev(dev);
    return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe);
}
/***************************************************************************/
/*
 * SR-IOV sysfs attributes.  The _RO entries use the *_show() handlers only;
 * the _RW entries (sriov_numvfs, sriov_drivers_autoprobe) additionally use
 * the matching *_store() handlers defined elsewhere in this file.
 */
static DEVICE_ATTR_RO(sriov_totalvfs);
static DEVICE_ATTR_RW(sriov_numvfs);
static DEVICE_ATTR_RO(sriov_offset);
static DEVICE_ATTR_RO(sriov_stride);
static DEVICE_ATTR_RO(sriov_vf_device);
static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
/* NULL-terminated list collected into sriov_dev_attr_group. */
static struct attribute *sriov_dev_attrs[] = {
    &dev_attr_sriov_totalvfs.attr,
    &dev_attr_sriov_numvfs.attr,
    &dev_attr_sriov_offset.attr,
    &dev_attr_sriov_stride.attr,
    &dev_attr_sriov_vf_device.attr,
    &dev_attr_sriov_drivers_autoprobe.attr,
    NULL,
};
/*
 * Visibility callback for the SR-IOV attribute group: the attributes keep
 * their declared mode on a PF and are hidden (mode 0) on any other device.
 */
static umode_t sriov_attrs_are_visible(struct kobject *kobj,
                                       struct attribute *a, int n)
{
    return dev_is_pf(kobj_to_dev(kobj)) ? a->mode : 0;
}
/*
 * Attribute group bundling the SR-IOV sysfs files; is_visible restricts
 * them to physical functions.
 */
const struct attribute_group sriov_dev_attr_group = {
    .attrs = sriov_dev_attrs,
    .is_visible = sriov_attrs_are_visible,
};
/************************** enabling SR-IOV via sysfs **************************/
/*
 * sysfs store handler for "sriov_numvfs": runs when a value is written to
 * that attribute.  Asks the PF driver's sriov_configure() callback to enable
 * the requested number of VFs, or to disable them when 0 is written.
 *
 * Returns count on success, or a negative errno:
 *   -ERANGE  more VFs requested than pci_sriov_get_totalvfs() allows
 *   -ENOENT  no PF driver bound, or the driver has no sriov_configure()
 *   -EBUSY   VFs are already enabled (they must be disabled first)
 *   or whatever kstrtou16() / sriov_configure() returned.
 */
static ssize_t sriov_numvfs_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
    struct pci_dev *pdev = to_pci_dev(dev);
    int ret;
    u16 num_vfs;
    /* Parse the text written by the user (e.g. from a shell "echo N >")
    into num_vfs. */
    ret = kstrtou16(buf, 0, &num_vfs);
    /* Conversion failed: report the parse error. */
    if (ret < 0)
        return ret;
    /* Reject requests above the limit reported by pci_sriov_get_totalvfs()
    (driver_max_VFs, which starts out equal to the hardware TotalVFs). */
    if (num_vfs > pci_sriov_get_totalvfs(pdev))
        return -ERANGE;
    device_lock(&pdev->dev);
    /* Requested count equals the current one: nothing to do.  ret is still
    0 from kstrtou16(), so the write is reported as successful. */
    if (num_vfs == pdev->sriov->num_VFs)
        goto exit;
    /* is PF driver loaded w/callback
    Changing the VF count through sysfs needs a bound PF driver that
    implements the sriov_configure() callback; otherwise fail. */
    if (!pdev->driver || !pdev->driver->sriov_configure) {
        pci_info(pdev, "Driver does not support SRIOV configuration via sysfs\n");
        ret = -ENOENT;
        goto exit;
    }
    /* Writing 0 means "disable the VFs". */
    if (num_vfs == 0) {
        /* disable VFs */
        ret = pdev->driver->sriov_configure(pdev, 0);
        goto exit;
    }
    /* enable VFs -->
    Going from one non-zero VF count to another is not allowed: the
    existing VFs have to be disabled (write 0) first. */
    if (pdev->sriov->num_VFs) {
        pci_warn(pdev, "%d VFs already enabled. Disable before enabling %d VFs\n",
             pdev->sriov->num_VFs, num_vfs);
        ret = -EBUSY;
        goto exit;
    }
    /* Call the driver's sriov_configure() to actually enable the VFs.
    For drivers that plug in pci_sriov_configure_simple() this ends up in
    sriov_enable().  A negative return is an error; otherwise the return
    value is the number of VFs enabled, and a warning is printed when it
    differs from the number requested. */
    ret = pdev->driver->sriov_configure(pdev, num_vfs);
    if (ret < 0)
        goto exit;
    if (ret != num_vfs)
        pci_warn(pdev, "%d VFs requested; only %d enabled\n",
             num_vfs, ret);
exit:
    device_unlock(&pdev->dev);
    if (ret < 0)
        return ret;
    return count;
}
/*
 * Generic sriov_configure() implementation.
 *
 * @dev:       the PF
 * @nr_virtfn: number of VFs to enable; 0 disables SR-IOV
 *
 * Returns nr_virtfn on success (0 after a disable), -ENODEV if dev is not a
 * PF, -EPERM if any of its VFs is currently assigned, or the negative error
 * from sriov_enable().
 */
int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn)
{
    int err;

    might_sleep();

    /* is_physfn marks the device as a PF; only a PF carries SR-IOV state. */
    if (!dev->is_physfn)
        return -ENODEV;

    /* pci_vfs_assigned() counts this PF's VFs that are assigned (0 for a non-PF). */
    if (pci_vfs_assigned(dev)) {
        pci_warn(dev, "Cannot modify SR-IOV while VFs are assigned\n");
        return -EPERM;
    }

    if (nr_virtfn != 0) {
        /* This is where the VFs are really enabled. */
        err = sriov_enable(dev, nr_virtfn);
        return err < 0 ? err : nr_virtfn;
    }

    /* A count of 0 is the disable request (e.g. "0" written to sriov_numvfs). */
    sriov_disable(dev);
    return 0;
}
EXPORT_SYMBOL_GPL(pci_sriov_configure_simple);
/*
 * Enable SR-IOV on a physical function.
 *
 * @dev:       the PCI device (must be a PF)
 * @nr_virtfn: number of VFs to enable
 *
 * Returns the result of sriov_enable(), or -ENOSYS when dev is not a PF,
 * since only a PF can turn SR-IOV on.
 */
int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
{
    might_sleep();

    return dev->is_physfn ? sriov_enable(dev, nr_virtfn) : -ENOSYS;
}
EXPORT_SYMBOL_GPL(pci_enable_sriov);
/*
 * Enable nr_virtfn VFs on the PF dev.
 *
 * Validates the request against the capability registers, checks that the
 * VF BAR resources and the bus number range are usable, programs NumVFs,
 * sets VF Enable / VF Memory Space Enable and finally adds the VF devices.
 *
 * Returns 0 on success (also when nr_virtfn is 0), or a negative errno.  On
 * failure after the arch hook was called, the control register, the
 * "dep_link" symlink and NumVFs are rolled back at err_pcibios.
 */
static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
{
    int rc;
    int i;
    int nres;
    u16 initial;
    struct resource *res;
    struct pci_dev *pdev;
    /* Allocated and attached to the PF by sriov_init(). */
    struct pci_sriov *iov = dev->sriov;
    int bars = 0;
    int bus;
    /* Nothing to enable. */
    if (!nr_virtfn)
        return 0;
    /* num_VFs is only set at the end of a successful enable, so a non-zero
    value means VFs are already enabled. */
    if (iov->num_VFs)
        return -EINVAL;
    /* Read InitialVFs and validate it: it may never exceed TotalVFs, and
    unless PCI_SRIOV_CAP_VFM is set in the capabilities (presumably the
    "VF Migration Capable" bit - confirm against the SR-IOV spec) it must
    equal TotalVFs.  The requested count is then checked the same way:
    0..TotalVFs, and not above InitialVFs when VFM is clear. */
    pci_read_config_word(dev, iov->pos + PCI_SRIOV_INITIAL_VF, &initial);
    if (initial > iov->total_VFs ||
        (!(iov->cap & PCI_SRIOV_CAP_VFM) && (initial != iov->total_VFs)))
        return -EIO;
    if (nr_virtfn < 0 || nr_virtfn > iov->total_VFs ||
        (!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial)))
        return -EINVAL;
    /* Build a bitmask covering every IOV BAR resource and count how many of
    them have a parent resource.  The count must match the number of VF BARs
    found by sriov_init() (iov->nres). */
    nres = 0;
    for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
        /*bars = bars | ((1 << (i + PCI_IOV_RESOURCES)))*/
        bars |= (1 << (i + PCI_IOV_RESOURCES));
        res = &dev->resource[i + PCI_IOV_RESOURCES];
        if (res->parent)
            nres++;
    }
    if (nres != iov->nres) {
        pci_err(dev, "not enough MMIO resources for SR-IOV\n");
        return -ENOMEM;
    }
    /* Bus number of the last VF; it has to fit in the bus number range of
    the PF's bus. */
    bus = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
    if (bus > dev->bus->busn_res.end) {
        pci_err(dev, "can't enable %d VFs (bus %02x out of range of %pR)\n",
            nr_virtfn, bus, &dev->bus->busn_res);
        return -ENOMEM;
    }
    /* Enable the IOV BAR resources selected in 'bars'. */
    if (pci_enable_resources(dev, bars)) {
        pci_err(dev, "SR-IOV: IOV BARS not allocated\n");
        return -ENOMEM;
    }
    /* iov->link is the Function Dependency Link read in sriov_init().  If it
    names another function, that function must exist and be a PF; a
    "dep_link" symlink to it is created under this PF's sysfs directory. */
    if (iov->link != dev->devfn) {
        pdev = pci_get_slot(dev->bus, iov->link);
        if (!pdev)
            return -ENODEV;
        if (!pdev->is_physfn) {
            pci_dev_put(pdev);
            return -ENOSYS;
        }
        rc = sysfs_create_link(&dev->dev.kobj,
                    &pdev->dev.kobj, "dep_link");
        pci_dev_put(pdev);
        if (rc)
            return rc;
    }
    /* Remember the InitialVFs value read above. */
    iov->initial_VFs = initial;
    /* From here on 'initial' is the number of VFs to create:
    min(nr_virtfn, InitialVFs).  This branch is taken whenever fewer VFs
    than InitialVFs are requested. */
    if (nr_virtfn < initial)
        initial = nr_virtfn;
    rc = pcibios_sriov_enable(dev, initial);
    if (rc) {
        pci_err(dev, "failure %d from pcibios_sriov_enable()\n", rc);
        goto err_pcibios;
    }
    /* Program NumVFs (this also refreshes the cached offset/stride). */
    pci_iov_set_numvfs(dev, nr_virtfn);
    /* Turn on PCI_SRIOV_CTRL_VFE and PCI_SRIOV_CTRL_MSE (VF Enable and VF
    Memory Space Enable) in the cached control word, keeping the other bits,
    then write it to the device.  Config access is locked around the write
    and the 100 ms wait that follows it. */
    iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
    pci_cfg_access_lock(dev);
    pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
    msleep(100);
    pci_cfg_access_unlock(dev);
    /* Create the VF devices (see pci_iov_add_virtfn(): allocate a pci_dev,
    set it up, add it to the bus). */
    rc = sriov_add_vfs(dev, initial);
    if (rc)
        goto err_pcibios;
    /* Emit a KOBJ_CHANGE uevent for the PF. */
    kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
    /* Record the number of enabled VFs. */
    iov->num_VFs = nr_virtfn;
    return 0;
err_pcibios:
    /* Roll back: clear VFE/MSE, call the arch disable hook, drop the
    dep_link symlink and reset NumVFs to 0. */
    iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
    pci_cfg_access_lock(dev);
    pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
    ssleep(1);
    pci_cfg_access_unlock(dev);
    pcibios_sriov_disable(dev);
    if (iov->link != dev->devfn)
        sysfs_remove_link(&dev->dev.kobj, "dep_link");
    pci_iov_set_numvfs(dev, 0);
    return rc;
}

/*
 * Arch hook called by sriov_enable() before the VFs are switched on.  This
 * default (weak) version does nothing and reports success.
 */
int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
{
    return 0;
}
/*****************************************delete*********************************************************************/
/* Remove every enabled VF of the PF dev, in ascending VF index order. */
static void sriov_del_vfs(struct pci_dev *dev)
{
    struct pci_sriov *sriov = dev->sriov;
    int vf_id = 0;

    while (vf_id < sriov->num_VFs) {
        pci_iov_remove_virtfn(dev, vf_id);
        vf_id++;
    }
}
/***************************************release****************************************/
static void sriov_release(struct pci_dev *dev)
{
    /*调试*/
    BUG_ON(dev->sriov->num_VFs);
    if (dev != dev->sriov->dev)
        pci_dev_put(dev->sriov->dev);
    /*在初始化阶段申请的iov*/
    kfree(dev->sriov);
    dev->sriov = NULL;
}
/* Release the SR-IOV state of dev; does nothing unless dev is a PF. */
void pci_iov_release(struct pci_dev *dev)
{
    if (!dev->is_physfn)
        return;

    sriov_release(dev);
}
/******** remove (per the original note: runs on module unload or device removal) ********/
/*
 * Reset the driver-imposed VF limit back to the hardware TotalVFs and warn
 * if VFs are still enabled.  Does nothing for a non-PF device.
 */
void pci_iov_remove(struct pci_dev *dev)
{
    struct pci_sriov *sriov;

    if (!dev->is_physfn)
        return;

    sriov = dev->sriov;
    sriov->driver_max_VFs = sriov->total_VFs;

    if (sriov->num_VFs)
        pci_warn(dev, "driver left SR-IOV enabled after remove\n");
}
/*
 * Remove the bus a VF lived on, but only if it is a different bus from the
 * PF's and no devices are left on it.
 */
static void virtfn_remove_bus(struct pci_bus *physbus, struct pci_bus *virtbus)
{
    if (physbus == virtbus)
        return;

    if (!list_empty(&virtbus->devices))
        return;

    pci_remove_bus(virtbus);
}
/*
 * Remove VF number id of the PF dev: drop the "virtfnN" and "physfn" sysfs
 * links, stop and remove the VF device, and remove its bus if that bus is
 * now empty.  Silently returns when the VF device cannot be found.
 */
void pci_iov_remove_virtfn(struct pci_dev *dev, int id)
{
    struct pci_dev *vf;
    char link_name[VIRTFN_ID_LEN];
    int domain = pci_domain_nr(dev->bus);
    int vf_bus = pci_iov_virtfn_bus(dev, id);
    int vf_devfn = pci_iov_virtfn_devfn(dev, id);

    vf = pci_get_domain_bus_and_slot(domain, vf_bus, vf_devfn);
    if (!vf)
        return;

    sprintf(link_name, "virtfn%u", id);
    sysfs_remove_link(&dev->dev.kobj, link_name);

    /* Only touch the VF's sysfs directory if it still has one. */
    if (vf->dev.kobj.sd)
        sysfs_remove_link(&vf->dev.kobj, "physfn");

    pci_stop_and_remove_bus_device(vf);
    virtfn_remove_bus(dev->bus, vf->bus);

    /* balance pci_get_domain_bus_and_slot() */
    pci_dev_put(vf);
    /* Drop the PF reference taken by pci_iov_add_virtfn(). */
    pci_dev_put(dev);
}
/**************************** sriov disable ****************************/
/*
 * Arch hook called when SR-IOV is being disabled on a PF.  This default
 * (weak) version does nothing and reports success.
 */
int __weak pcibios_sriov_disable(struct pci_dev *pdev)
{
    return 0;
}
/*
 * Disable all VFs of the PF dev.  No-op when no VFs are enabled.
 * Sequence: remove the VF devices, clear VF Enable / VF Memory Space Enable,
 * call the arch hook, remove the "dep_link" symlink, reset num_VFs and the
 * NumVFs register.
 */
static void sriov_disable(struct pci_dev *dev)
{
    struct pci_sriov *iov = dev->sriov;
    if (!iov->num_VFs)
        return;
    /* Must run while num_VFs is still set: sriov_del_vfs() iterates over it. */
    sriov_del_vfs(dev);
    iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
    /* Config access stays locked across the write and the 1 s wait. */
    pci_cfg_access_lock(dev);
    pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
    ssleep(1);
    pci_cfg_access_unlock(dev);
    pcibios_sriov_disable(dev);
    /* Mirrors the link creation in sriov_enable(). */
    if (iov->link != dev->devfn)
        sysfs_remove_link(&dev->dev.kobj, "dep_link");
    iov->num_VFs = 0;
    pci_iov_set_numvfs(dev, 0);
}
/* Disable SR-IOV on dev; does nothing unless dev is a PF.  May sleep. */
void pci_disable_sriov(struct pci_dev *dev)
{
    might_sleep();

    if (dev->is_physfn)
        sriov_disable(dev);
}
EXPORT_SYMBOL_GPL(pci_disable_sriov);
/**************************************** init ****************************************/
/*
 * Summary: sriov_init() sizes the VF BARs, records how many VFs the PF
 * supports and caches the SR-IOV capability fields in a new struct
 * pci_sriov.  It does not enable any VF: VFs only appear once
 * pci_enable_sriov() (or the sysfs/sriov_configure path) is called with the
 * number of VFs to enable.
 *
 * @dev: device that carries the SR-IOV extended capability
 * @pos: config-space offset of that capability
 *
 * Returns 0 on success (also when TotalVFs is 0, in which case the device is
 * not marked as a PF), -EIO on unusable page-size/BAR/offset data, -ENOMEM
 * if the allocation fails.
 */
static int sriov_init(struct pci_dev *dev, int pos)
{
    int i, bar64;
    int rc;
    int nres;
    u32 pgsz;
    u16 ctrl, total;
    struct pci_sriov *iov;
    struct resource *res;
    struct pci_dev *pdev;
    /* VFs must not be enabled during initialization: if VF Enable is found
    set, clear the whole control register and wait. */
    pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
    /* Test the PCI_SRIOV_CTRL_VFE bit of the 16-bit control word. */
    if (ctrl & PCI_SRIOV_CTRL_VFE) {
        /* Presumably the control register is normally 0 at this point and
        this only happens if something left VFs enabled - confirm. */
        pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
        ssleep(1);
    }
    /* Look for a PF that is already on this bus.  If one is found, jump to
    'found' with pdev pointing at it and ctrl left at 0.  Otherwise pdev is
    reset to NULL (the list cursor is not a valid device after the loop) and
    the ARI bit is set in ctrl when ARI is enabled on the bus.  pdev is used
    further down to fill in iov->dev. */
    ctrl = 0;
    list_for_each_entry(pdev, &dev->bus->devices, bus_list)
        if (pdev->is_physfn)
            goto found;
    pdev = NULL;
    if (pci_ari_enabled(dev->bus))
        ctrl |= PCI_SRIOV_CTRL_ARI;
found:
    pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
    /* TotalVFs: how many VFs this PF supports.  Nothing to do if 0. */
    pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total);
    if (!total)
        return 0;
    /* Supported Page Sizes: a bitmask of the page sizes the PF supports.
    Bit n set means pages of 2^(n + 12) bytes are supported (0 <= n <= 31);
    e.g. 0x00000553 means 4KB, 8KB, 64KB, 256KB, 1MB and 4MB.
    Sizes smaller than the kernel's PAGE_SIZE are masked off; if nothing is
    left the device cannot be used. */
    pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
    i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
    pgsz &= ~((1 << i) - 1);
    if (!pgsz)
        return -EIO;
    /* System Page Size: set by software, exactly one bit, which must be one
    of the supported sizes (value 1 means 4KB).  All VF BARs of this PF are
    aligned to it.  The expression below keeps only the lowest set bit, i.e.
    the smallest remaining supported size. */
    pgsz &= ~(pgsz - 1);
    pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);
    /* Allocate the per-PF SR-IOV state (zero-initialized). */
    iov = kzalloc(sizeof(*iov), GFP_KERNEL);
    if (!iov)
        return -ENOMEM;
    /* Walk the VF BARs and fill in the PF's IOV resources. */
    nres = 0;
    for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
        /* res is the PF's IOV resource slot for VF BAR i. */
        res = &dev->resource[i + PCI_IOV_RESOURCES];
        /* A resource flagged IORESOURCE_PCI_FIXED is not read from the
        device; only its 64-bit-ness is taken from the flags. */
        if (res->flags & IORESOURCE_PCI_FIXED)
            bar64 = (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
        else
            /* Otherwise read the VF BAR from the capability.  A 64-bit BAR
            occupies two BAR slots; the return value is added to i below
            (i += bar64) so the upper half is skipped. */
            bar64 = __pci_read_base(dev, pci_bar_unknown, res, pos + PCI_SRIOV_BAR + i * 4);
        /* No flags: presumably this VF BAR is not implemented/usable, so
        skip it - confirm against __pci_read_base(). */
        if (!res->flags)
            continue;
        /* The per-VF BAR size must be a multiple of PAGE_SIZE. */
        if (resource_size(res) & (PAGE_SIZE - 1)) {
            rc = -EIO;
            goto failed;
        }
        /* Remember the size of this BAR for a single VF. */
        iov->barsz[i] = resource_size(res);
        /* Grow the resource to hold this BAR for all 'total' VFs.  Each VF
        later gets its slice: VF k's BAR i starts at
        res->start + barsz[i] * k (see pci_iov_add_virtfn()). */
        res->end = res->start + resource_size(res) * total - 1;
        pci_info(dev, "VF(n) BAR%d space: %pR (contains BAR%d for %d VFs)\n", i, res, i, total);
        i += bar64;
        /* Count the VF BARs found. */
        nres++;
    }
    /* Cache the remaining capability fields. */
    iov->pos = pos;
    iov->nres = nres;
    iov->ctrl = ctrl;
    iov->total_VFs = total;
    iov->driver_max_VFs = total;
    pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
    iov->pgsz = pgsz;
    iov->self = dev;
    iov->drivers_autoprobe = true;
    /*SR-IOV Capabilities*/
    pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap);
    /* SR-IOV Function Dependency Link.  For a Root Complex integrated
    endpoint the value is combined with the device's own slot number. */
    pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link);
    if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END)
        iov->link = PCI_DEVFN(PCI_SLOT(dev->devfn), iov->link);
    /* iov->dev is the PF found on the bus by the loop above (with a
    reference taken), or this device itself when there was none. */
    if (pdev)
        iov->dev = pci_dev_get(pdev);
    else
        iov->dev = dev;
    /* Attach the state to the device; dev is the PF. */
    dev->sriov = iov;
    /* Mark the device as a physical function. */
    dev->is_physfn = 1;
    /* Fills in iov->max_VF_buses. */
    rc = compute_max_vf_buses(dev);
    if (rc)
        goto fail_max_buses;
    return 0;
fail_max_buses:
    /* NOTE(review): the reference taken by pci_dev_get(pdev) above is not
    dropped on this path in the visible code - confirm whether that is
    intended. */
    dev->sriov = NULL;
    dev->is_physfn = 0;
failed:
    /* Invalidate every IOV resource slot and free the state. */
    for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
        res = &dev->resource[i + PCI_IOV_RESOURCES];
        res->flags = 0;
    }
    kfree(iov);
    return rc;
}
/*
 * Initialize SR-IOV support for dev.
 *
 * Returns -ENODEV if dev is not a PCIe device (only PCIe devices can carry
 * the SR-IOV capability) or has no SR-IOV extended capability; otherwise the
 * result of sriov_init().
 */
int pci_iov_init(struct pci_dev *dev)
{
    int cap;

    if (!pci_is_pcie(dev))
        return -ENODEV;

    /* Non-zero return is the config-space offset of the SR-IOV capability. */
    cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV);
    if (!cap)
        return -ENODEV;

    return sriov_init(dev, cap);
}
/******************************************************************************************/
/*
 * Write the bus address of IOV resource resno back into the matching VF BAR
 * register of the SR-IOV capability.
 *
 * Nothing is written when dev is not a PF, when the VFs are enabled with
 * memory space on (a warning is issued), or when the resource has no flags,
 * is unset, or is fixed.
 */
void pci_iov_update_resource(struct pci_dev *dev, int resno)
{
    const u16 live = PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
    struct pci_sriov *sriov;
    struct resource *res = &dev->resource[resno];
    struct pci_bus_region region;
    int vf_bar = resno - PCI_IOV_RESOURCES;
    int bar_reg;
    u16 ctrl;
    u32 val;

    if (!dev->is_physfn || !dev->sriov)
        return;
    sriov = dev->sriov;

    pci_read_config_word(dev, sriov->pos + PCI_SRIOV_CTRL, &ctrl);
    if ((ctrl & live) == live) {
        dev_WARN(&dev->dev, "can't update enabled VF BAR%d %pR\n",
                 vf_bar, res);
        return;
    }

    if (!res->flags ||
        (res->flags & (IORESOURCE_UNSET | IORESOURCE_PCI_FIXED)))
        return;

    pcibios_resource_to_bus(dev->bus, &region, res);

    /* Low dword: bus address plus the flag bits outside the address mask. */
    val = region.start;
    val |= res->flags & ~PCI_BASE_ADDRESS_MEM_MASK;
    bar_reg = sriov->pos + PCI_SRIOV_BAR + 4 * vf_bar;
    pci_write_config_dword(dev, bar_reg, val);

    /* 64-bit BAR: the upper 32 bits go into the next register. */
    if (res->flags & IORESOURCE_MEM_64) {
        val = region.start >> 16 >> 16;
        pci_write_config_dword(dev, bar_reg + 4, val);
    }
}
/****************************************************************************/
/*
 * Re-program the SR-IOV capability of the PF dev from the cached state in
 * dev->sriov.  Does nothing if the device already has VF Enable set.
 */
static void sriov_restore_state(struct pci_dev *dev)
{
    int i;
    u16 ctrl;
    struct pci_sriov *iov = dev->sriov;
    pci_read_config_word(dev, iov->pos + PCI_SRIOV_CTRL, &ctrl);
    /* VF Enable still set: leave the hardware alone. */
    if (ctrl & PCI_SRIOV_CTRL_VFE)
        return;
    /* First restore only the ARI bit from the cached control word. */
    ctrl &= ~PCI_SRIOV_CTRL_ARI;
    ctrl |= iov->ctrl & PCI_SRIOV_CTRL_ARI;
    pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, ctrl);
    /* Then the VF BARs, System Page Size and NumVFs ... */
    for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
        pci_update_resource(dev, i + PCI_IOV_RESOURCES);
    pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
    pci_iov_set_numvfs(dev, iov->num_VFs);
    /* ... and finally the full cached control word; wait 100 ms if that
    turned VF Enable on. */
    pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
    if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
        msleep(100);
}
/* Restore the SR-IOV capability state of dev; does nothing unless dev is a PF. */
void pci_restore_iov_state(struct pci_dev *dev)
{
    if (!dev->is_physfn)
        return;

    sriov_restore_state(dev);
}
/**********************************************************************************/
/* Set the PF's drivers_autoprobe flag; ignored when dev is not a PF. */
void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool auto_probe)
{
    if (!dev->is_physfn)
        return;

    dev->sriov->drivers_autoprobe = auto_probe;
}
/*
 * sysfs store handler for "sriov_drivers_autoprobe": parses a boolean and
 * stores it in the PF's drivers_autoprobe flag.  Returns count, or -EINVAL
 * if the input is not a valid boolean.
 */
static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
                                             struct device_attribute *attr,
                                             const char *buf, size_t count)
{
    bool enable;

    if (kstrtobool(buf, &enable) < 0)
        return -EINVAL;

    to_pci_dev(dev)->sriov->drivers_autoprobe = enable;
    return count;
}
/**************************************************************************/
/*
 * Number of extra bus numbers the VFs of the PFs on bus may need: the
 * largest max_VF_buses among those PFs minus the bus's own number, or 0 if
 * no PF on the bus recorded a VF bus.
 */
int pci_iov_bus_range(struct pci_bus *bus)
{
    struct pci_dev *pf;
    int highest = 0;

    list_for_each_entry(pf, &bus->devices, bus_list) {
        if (pf->is_physfn && pf->sriov->max_VF_buses > highest)
            highest = pf->sriov->max_VF_buses;
    }

    if (!highest)
        return 0;

    return highest - bus->number;
}
/***************************************************************************************/
/*
 * Let the PF driver lower the number of VFs that may be enabled.
 *
 * The limit is kept in driver_max_VFs, separate from the hardware TotalVFs
 * (total_VFs); this is why pci_sriov_get_totalvfs() reports driver_max_VFs
 * rather than total_VFs.
 *
 * Returns 0 on success, -ENOSYS if dev is not a PF, -EINVAL if numvfs
 * exceeds the hardware TotalVFs, -EBUSY if VFs are already enabled.
 */
int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs)
{
    struct pci_sriov *sriov;

    if (!dev->is_physfn)
        return -ENOSYS;

    sriov = dev->sriov;
    if (numvfs > sriov->total_VFs)
        return -EINVAL;

    /* Shouldn't change if VFs already enabled */
    if (sriov->ctrl & PCI_SRIOV_CTRL_VFE)
        return -EBUSY;

    sriov->driver_max_VFs = numvfs;
    return 0;
}
EXPORT_SYMBOL_GPL(pci_sriov_set_totalvfs);
/*
 * Maximum number of VFs that may be enabled on dev: the driver-adjustable
 * driver_max_VFs value.  Returns 0 when dev is not a PF.
 */
int pci_sriov_get_totalvfs(struct pci_dev *dev)
{
    return dev->is_physfn ? dev->sriov->driver_max_VFs : 0;
}
EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
/* Number of VFs currently enabled on dev; 0 when dev is not a PF. */
int pci_num_vf(struct pci_dev *dev)
{
    return dev->is_physfn ? dev->sriov->num_VFs : 0;
}
EXPORT_SYMBOL_GPL(pci_num_vf);
/**************************************************************************************/
/*
 * Count the VFs of the PF dev for which pci_is_dev_assigned() is true.
 * Returns 0 when dev is not a PF.
 */
int pci_vfs_assigned(struct pci_dev *dev)
{
    struct pci_dev *vf;
    unsigned int assigned = 0;
    unsigned short vf_did;

    /* only search if we are a PF */
    if (!dev->is_physfn)
        return 0;

    vf_did = dev->sriov->vf_device;

    /* Walk every device with our vendor ID and VF device ID ... */
    for (vf = pci_get_device(dev->vendor, vf_did, NULL); vf;
         vf = pci_get_device(dev->vendor, vf_did, vf)) {
        /* ... keeping only VFs that belong to this PF. */
        if (!vf->is_virtfn || vf->physfn != dev)
            continue;
        if (pci_is_dev_assigned(vf))
            assigned++;
    }

    return assigned;
}
EXPORT_SYMBOL_GPL(pci_vfs_assigned);
/*
 * Add VFs 0 .. num_vfs-1 of the PF dev, one by one.  If adding one fails,
 * the VFs added so far are removed again (highest index first) and the
 * error is returned.  Returns 0 immediately when dev->no_vf_scan is set.
 */
static int sriov_add_vfs(struct pci_dev *dev, u16 num_vfs)
{
    unsigned int added;
    int err;

    if (dev->no_vf_scan)
        return 0;

    for (added = 0; added < num_vfs; added++) {
        err = pci_iov_add_virtfn(dev, added);
        if (err) {
            while (added--)
                pci_iov_remove_virtfn(dev, added);
            return err;
        }
    }

    return 0;
}
/*
 * Return the pci_bus for bus number busnr, to be used by a VF of a PF on
 * bus: the PF's own bus when the numbers match, an existing bus with that
 * number, or a newly added child bus.  Returns NULL if a new bus cannot be
 * added.
 */
static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
{
    struct pci_bus *vbus;

    if (bus->number == busnr)
        return bus;

    vbus = pci_find_bus(pci_domain_nr(bus), busnr);
    if (!vbus) {
        vbus = pci_add_new_bus(bus, NULL, busnr);
        if (vbus)
            pci_bus_insert_busn_res(vbus, busnr, busnr);
    }

    return vbus;
}
static void pci_read_vf_config_common(struct pci_dev *virtfn)
{
    struct pci_dev *physfn = virtfn->physfn;
    pci_read_config_dword(virtfn, PCI_CLASS_REVISION,
                  &physfn->sriov->class);
    pci_read_config_byte(virtfn, PCI_HEADER_TYPE,
                 &physfn->sriov->hdr_type);
    pci_read_config_word(virtfn, PCI_SUBSYSTEM_VENDOR_ID,
                 &physfn->sriov->subsystem_vendor);
    pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID,
                 &physfn->sriov->subsystem_device);
}
/*
 * Create the sysfs links between a PF and one of its VFs: "virtfn<id>" in
 * the PF's directory pointing at the VF, and "physfn" in the VF's directory
 * pointing back at the PF.  A KOBJ_CHANGE uevent is sent for the VF on
 * success.
 *
 * Returns 0 on success or the sysfs_create_link() error; the first link is
 * removed again if the second one cannot be created.
 */
int pci_iov_sysfs_link(struct pci_dev *dev,
                       struct pci_dev *virtfn, int id)
{
    char link_name[VIRTFN_ID_LEN];
    int err;

    sprintf(link_name, "virtfn%u", id);

    err = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, link_name);
    if (err)
        return err;

    err = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
    if (err) {
        sysfs_remove_link(&dev->dev.kobj, link_name);
        return err;
    }

    kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);
    return 0;
}
/*
 * Create and register VF number id of the PF dev.
 *
 * Steps: find or create the bus the VF lives on, allocate a pci_dev for it,
 * fill in its identity from the PF's SR-IOV state, set it up, carve its BARs
 * out of the PF's IOV resources, add it to the bus and create the sysfs
 * links between PF and VF.
 *
 * Returns 0 on success or a negative errno (-ENOMEM if the bus or the device
 * cannot be allocated, otherwise the error of the failing step).
 */
int pci_iov_add_virtfn(struct pci_dev *dev, int id)
{
    int i;
    int rc = -ENOMEM;
    u64 size;
    struct pci_dev *virtfn;
    struct resource *res;
    struct pci_sriov *iov = dev->sriov;
    struct pci_bus *bus;

    /* Bus for this VF: the PF's bus or a (possibly new) virtual bus. */
    bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
    if (!bus)
        goto failed;

    /* Allocate a pci_dev on that bus. */
    virtfn = pci_alloc_dev(bus);
    if (!virtfn)
        goto failed0;

    /* Identity of the VF: devfn from offset/stride, vendor from the PF,
    device ID from the SR-IOV capability. */
    virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
    virtfn->vendor = dev->vendor;
    virtfn->device = iov->vf_device;
    /* Mark it as a virtual function and link it to its PF (takes a
    reference on the PF; dropped in pci_iov_remove_virtfn()). */
    virtfn->is_virtfn = 1;
    virtfn->physfn = pci_dev_get(dev);
    /* NOTE(review): the flag name suggests VFs do not implement the Memory
    Space bit of PCI_COMMAND - confirm against struct pci_dev. */
    virtfn->no_command_memory = 1;

    /* Read only for VF 0: the values are stored in the PF's pci_sriov
    (see pci_read_vf_config_common()), presumably shared by all VFs. */
    if (id == 0)
        pci_read_vf_config_common(virtfn);

    rc = pci_setup_device(virtfn);
    if (rc) {
        /* The device has not been handed to pci_device_add() yet, so undo
        what was acquired above directly instead of going through the
        bus-removal path: the PF reference, the bus reference held by the
        new pci_dev, and the allocation itself. */
        pci_dev_put(dev);
        pci_bus_put(virtfn->bus);
        kfree(virtfn);
        goto failed0;
    }

    /* The VF gets the same parent device as its PF. */
    virtfn->dev.parent = dev->dev.parent;
    /* Single-function device. */
    virtfn->multifunction = 0;

    /* Give the VF its slice of every IOV resource sized in sriov_init():
    VF id's BAR i starts at res->start + (per-VF size) * id. */
    for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
        res = &dev->resource[i + PCI_IOV_RESOURCES];
        if (!res->parent)
            continue;
        virtfn->resource[i].name = pci_name(virtfn);
        virtfn->resource[i].flags = res->flags;
        size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
        virtfn->resource[i].start = res->start + size * id;
        virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
        rc = request_resource(res, &virtfn->resource[i]);
        BUG_ON(rc);
    }

    /* Register the device on its bus. */
    pci_device_add(virtfn, virtfn->bus);

    /* "virtfn<id>" / "physfn" sysfs links. */
    rc = pci_iov_sysfs_link(dev, virtfn, id);
    if (rc)
        goto failed1;

    /* Finish adding the VF device to the bus. */
    pci_bus_add_device(virtfn);
    return 0;

failed1:
    /* The device was added: remove it through the normal path and drop the
    PF reference taken above. */
    pci_stop_and_remove_bus_device(virtfn);
    pci_dev_put(dev);
failed0:
    virtfn_remove_bus(dev->bus, bus);
failed:
    return rc;
}

总结:这部分代码量比较庞大,涉及的东西比较多。通过阅读,要真正的掌握这部分代码,我的感受有以下几点。
1:由于在enable sriov时需要指定num_vfs,这里在添加一个vf设备时涉及的过程和pci设备枚举是一样的,由于自己对这部分不是非常的熟悉,所以这里有些部分的代码无法吃透。可能后续我会先去阅读pcie枚举部分的代码,再回来看sriov。

2:在阅读时,大部分操作涉及寄存器的赋值,这里推荐一下这个网址:
https://www.intel.cn/content/www/cn/zh/docs/programmable/683111/17-1/initial-vfs-and-total-vfs-registers.html

3:有些细节问题需要多加打印语句才能理解,在阅读这部分代码时我没有调试,这是我后续需要改进的地方。

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐