sriov驱动API总结
一:API概览。主要涉及以下的API(完整声明见下文):pci_iov_virtfn_bus()、pci_iov_virtfn_devfn()、pci_enable_sriov() 等,均受 CONFIG_PCI_IOV 配置项控制。
sriov功能的简单理解
B站上有两个视频讲得挺好,从普通模型的收包处理流程(两次中断和一次复制),到vmdq(一次中断和一次复制),再到sriov(一次中断),讲述了模型一步步优化的过程。
sriov简单理解
其它参考文章:
1:基于sriov寄存器原理讲解的
2:https://projectacrn.github.io/latest/tutorials/sriov_virtualization.html#sr
3:https://access.redhat.com/documentation/zh-cn/red_hat_enterprise_linux_openstack_platform/7/html/networking_guide/sec-sr-iov
sriov功能涉及的api主要如下所示
#ifdef CONFIG_PCI_IOV
int pci_iov_virtfn_bus(struct pci_dev *dev, int id);
int pci_iov_virtfn_devfn(struct pci_dev *dev, int id);
int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
void pci_disable_sriov(struct pci_dev *dev);
int pci_iov_add_virtfn(struct pci_dev *dev, int id);
void pci_iov_remove_virtfn(struct pci_dev *dev, int id);
int pci_num_vf(struct pci_dev *dev);
int pci_vfs_assigned(struct pci_dev *dev);
int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
int pci_sriov_get_totalvfs(struct pci_dev *dev);
int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn);
resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
/* Arch may override these (weak) */
int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs);
int pcibios_sriov_disable(struct pci_dev *pdev);
resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno);
#else
/*
 * Stub selected by the #else branch of CONFIG_PCI_IOV: without SR-IOV
 * support there is no VF bus number to compute, so it always fails with
 * -ENOSYS.
 */
static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id)
{
return -ENOSYS;
}
参考:sriov主要api
sriov api理解分析
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/delay.h>
#include "pci.h"
#define VIRTFN_ID_LEN 16
/*
 * VF Routing ID arithmetic.
 *
 * First VF Offset: Routing ID offset of the first VF relative to its PF.
 * VF Stride:       Routing ID distance between two consecutive VFs.
 * A Routing ID is made of an 8-bit bus number, a 5-bit device number and
 * a 3-bit function number.
 *
 * For the N-th VF (N counted from 1):
 *   Routing ID = PF Routing ID + First VF Offset + (N - 1) * VF Stride
 *
 * With a zero-based vf_id the code below evaluates:
 *   vf_bus   = pf_bus + ((pf_devfn + offset + stride * vf_id) >> 8)
 *   vf_devfn = (pf_devfn + offset + stride * vf_id) & 0xff
 */

/*
 * pci_iov_virtfn_bus - bus number of a VF
 * @dev: the PF
 * @vf_id: zero-based VF index
 *
 * Returns the bus number the VF lands on, or -EINVAL when @dev is not a PF.
 */
int pci_iov_virtfn_bus(struct pci_dev *dev, int vf_id)
{
	if (dev->is_physfn)
		return dev->bus->number +
		       ((dev->devfn + dev->sriov->offset +
			 dev->sriov->stride * vf_id) >> 8);

	return -EINVAL;
}
/*
 * pci_iov_virtfn_devfn - device/function number of a VF
 * @dev: the PF
 * @vf_id: zero-based VF index
 *
 * Returns the low 8 bits of the VF's Routing ID offset from the PF's bus,
 * or -EINVAL when @dev is not a PF.
 */
int pci_iov_virtfn_devfn(struct pci_dev *dev, int vf_id)
{
	if (dev->is_physfn)
		return (dev->devfn + dev->sriov->offset +
			dev->sriov->stride * vf_id) & 0xff;

	return -EINVAL;
}
/*
 * Program NumVFs, then re-read First VF Offset and VF Stride into the
 * cached pci_sriov state. Both are read back after every NumVFs write
 * (presumably because the device may report different values depending on
 * NumVFs -- confirm against the SR-IOV specification).
 */
static inline void pci_iov_set_numvfs(struct pci_dev *dev, int nr_virtfn)
{
	struct pci_sriov *sriov = dev->sriov;
	int cap = sriov->pos;

	pci_write_config_word(dev, cap + PCI_SRIOV_NUM_VF, nr_virtfn);
	pci_read_config_word(dev, cap + PCI_SRIOV_VF_OFFSET, &sriov->offset);
	pci_read_config_word(dev, cap + PCI_SRIOV_VF_STRIDE, &sriov->stride);
}
/*
 * Walk every possible NumVFs value from total_VFs down to 1, and record in
 * iov->max_VF_buses the highest bus number any VF could occupy.
 *
 * Returns 0 on success, or -EIO if the device reports a zero First VF
 * Offset, or a zero VF Stride while more than one VF is configured.
 * NumVFs is always reset to 0 before returning.
 */
static int compute_max_vf_buses(struct pci_dev *dev)
{
	struct pci_sriov *iov = dev->sriov;
	int count = iov->total_VFs;
	int ret = 0;

	while (count) {
		int last_bus;

		/* offset/stride are refreshed for this particular NumVFs */
		pci_iov_set_numvfs(dev, count);
		if (!iov->offset || (count > 1 && !iov->stride)) {
			ret = -EIO;
			break;
		}

		/* bus of the highest-numbered VF for this NumVFs */
		last_bus = pci_iov_virtfn_bus(dev, count - 1);
		if (last_bus > iov->max_VF_buses)
			iov->max_VF_buses = last_bus;

		count--;
	}

	pci_iov_set_numvfs(dev, 0);
	return ret;
}
/********************************************************************************************/
/*
 * pci_iov_resource_size - size of a single VF's BAR
 * @dev: the PF
 * @resno: resource index in the PF's resource[] array (an IOV resource,
 *         i.e. offset by PCI_IOV_RESOURCES)
 *
 * Returns the per-VF BAR size cached in barsz[], or 0 if @dev is not a PF.
 */
resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
{
	if (dev->is_physfn)
		return dev->sriov->barsz[resno - PCI_IOV_RESOURCES];

	return 0;
}
/*
 * Default (weak) alignment for a VF BAR: the size of one VF's BAR.
 * An architecture may provide its own definition.
 */
resource_size_t __weak pcibios_iov_resource_alignment(struct pci_dev *dev,
						      int resno)
{
	resource_size_t align = pci_iov_resource_size(dev, resno);

	return align;
}
/*
 * pci_sriov_resource_alignment - alignment required for a VF BAR
 * @dev: the PF
 * @resno: IOV resource index
 *
 * Simply defers to the (possibly arch-overridden) pcibios hook.
 */
resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno)
{
	resource_size_t align = pcibios_iov_resource_alignment(dev, resno);

	return align;
}
/*****************************************************************************************/
/* sysfs read handler: prints the value of pci_sriov_get_totalvfs(). */
static ssize_t sriov_totalvfs_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct pci_dev *pf = to_pci_dev(dev);

	return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(pf));
}
/* sysfs read handler: prints the number of currently enabled VFs. */
static ssize_t sriov_numvfs_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct pci_dev *pf = to_pci_dev(dev);
	u16 enabled;

	/*
	 * Take the device lock, as sriov_numvfs_store() does, so the value
	 * is not sampled in the middle of a reconfiguration.
	 */
	device_lock(&pf->dev);
	enabled = pf->sriov->num_VFs;
	device_unlock(&pf->dev);

	return sprintf(buf, "%u\n", enabled);
}
static ssize_t sriov_offset_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct pci_dev *pdev = to_pci_dev(dev);
return sprintf(buf, "%u\n", pdev->sriov->offset);
}
static ssize_t sriov_stride_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct pci_dev *pdev = to_pci_dev(dev);
return sprintf(buf, "%u\n", pdev->sriov->stride);
}
static ssize_t sriov_vf_device_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct pci_dev *pdev = to_pci_dev(dev);
return sprintf(buf, "%x\n", pdev->sriov->vf_device);
}
static ssize_t sriov_drivers_autoprobe_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct pci_dev *pdev = to_pci_dev(dev);
return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe);
}
/***************************************************************************/
/*
 * SR-IOV sysfs attributes of a PF. Each DEVICE_ATTR_* declaration defines
 * the dev_attr_<name> object referenced in the table below; presumably the
 * macros bind them to the <name>_show()/<name>_store() handlers in this
 * file (RO: show only, RW: show and store) -- see the DEVICE_ATTR_*
 * definitions to confirm.
 */
static DEVICE_ATTR_RO(sriov_totalvfs);
static DEVICE_ATTR_RW(sriov_numvfs);
static DEVICE_ATTR_RO(sriov_offset);
static DEVICE_ATTR_RO(sriov_stride);
static DEVICE_ATTR_RO(sriov_vf_device);
static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
/* NULL-terminated attribute list used by sriov_dev_attr_group. */
static struct attribute *sriov_dev_attrs[] = {
&dev_attr_sriov_totalvfs.attr,
&dev_attr_sriov_numvfs.attr,
&dev_attr_sriov_offset.attr,
&dev_attr_sriov_stride.attr,
&dev_attr_sriov_vf_device.attr,
&dev_attr_sriov_drivers_autoprobe.attr,
NULL,
};
/*
 * Visibility callback for the SR-IOV attribute group: the attributes keep
 * their declared mode on a PF and are hidden (mode 0) on any other device.
 */
static umode_t sriov_attrs_are_visible(struct kobject *kobj,
				       struct attribute *a, int n)
{
	struct device *dev = kobj_to_dev(kobj);

	return dev_is_pf(dev) ? a->mode : 0;
}
/*
 * Attribute group bundling the SR-IOV sysfs files; is_visible restricts
 * them to devices for which dev_is_pf() is true.
 */
const struct attribute_group sriov_dev_attr_group = {
.attrs = sriov_dev_attrs,
.is_visible = sriov_attrs_are_visible,
};
/**************************************************使能sriov功能***************************************************/
/*
 * sysfs write handler for sriov_numvfs: entered when user space writes a
 * VF count to the attribute.
 *
 * Writing 0 disables all VFs; writing a non-zero count enables that many,
 * but only when no VFs are currently enabled. The actual work is delegated
 * to the PF driver's sriov_configure() callback.
 *
 * Returns @count on success (including the no-op case where the requested
 * count equals the current one), or a negative errno:
 *   error from kstrtou16()  - @buf is not a valid u16
 *   -ERANGE - more VFs requested than pci_sriov_get_totalvfs() allows
 *   -ENOENT - no bound driver, or the driver lacks sriov_configure()
 *   -EBUSY  - VFs are already enabled and a different non-zero count was asked
 *   any negative value returned by sriov_configure()
 */
static ssize_t sriov_numvfs_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct pci_dev *pdev = to_pci_dev(dev);
int ret;
u16 num_vfs;
/* Parse the string written by user space into an unsigned 16-bit count. */
ret = kstrtou16(buf, 0, &num_vfs);
/* Conversion failed: propagate the parser's error. */
if (ret < 0)
return ret;
/* Reject requests above the limit currently advertised for this PF. */
if (num_vfs > pci_sriov_get_totalvfs(pdev))
return -ERANGE;
device_lock(&pdev->dev);
/* Requested count equals the current one: nothing to do (ret is 0 here). */
if (num_vfs == pdev->sriov->num_VFs)
goto exit;
/*
 * Is a PF driver loaded with the callback? Changing the VF count through
 * sysfs only works when the bound driver implements sriov_configure().
 */
if (!pdev->driver || !pdev->driver->sriov_configure) {
pci_info(pdev, "Driver does not support SRIOV configuration via sysfs\n");
ret = -ENOENT;
goto exit;
}
/* A count of 0 means "disable the VFs". */
if (num_vfs == 0) {
/* disable VFs */
ret = pdev->driver->sriov_configure(pdev, 0);
goto exit;
}
/*
 * Enable VFs. If some VFs are already enabled, the user must first write
 * 0 to disable them before a new non-zero count is accepted.
 */
if (pdev->sriov->num_VFs) {
pci_warn(pdev, "%d VFs already enabled. Disable before enabling %d VFs\n",
pdev->sriov->num_VFs, num_vfs);
ret = -EBUSY;
goto exit;
}
/*
 * Hand the request to the driver. A negative return is an error; otherwise
 * the return value is the number of VFs actually enabled, which is expected
 * to equal num_vfs (a mismatch is only warned about).
 * NOTE(review): pci_sriov_configure_simple() in this file has the matching
 * signature and is presumably what simple drivers install here -- confirm
 * against the driver in use.
 */
ret = pdev->driver->sriov_configure(pdev, num_vfs);
if (ret < 0)
goto exit;
if (ret != num_vfs)
pci_warn(pdev, "%d VFs requested; only %d enabled\n",
num_vfs, ret);
exit:
device_unlock(&pdev->dev);
if (ret < 0)
return ret;
return count;
}
/*
 * pci_sriov_configure_simple - generic sriov_configure implementation
 * @dev: the PF
 * @nr_virtfn: number of VFs to enable, or 0 to disable SR-IOV
 *
 * Returns @nr_virtfn on success (0 after a disable), -ENODEV if @dev is
 * not a PF, -EPERM if pci_vfs_assigned() reports assigned VFs, or the
 * negative error from sriov_enable(). May sleep.
 */
int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn)
{
	int err;

	might_sleep();

	/* Only a physical function carries SR-IOV state. */
	if (!dev->is_physfn)
		return -ENODEV;

	/* Refuse to reconfigure while any VF of this PF is assigned. */
	if (pci_vfs_assigned(dev)) {
		pci_warn(dev, "Cannot modify SR-IOV while VFs are assigned\n");
		return -EPERM;
	}

	if (nr_virtfn != 0) {
		err = sriov_enable(dev, nr_virtfn);
		return err < 0 ? err : nr_virtfn;
	}

	/* A request for zero VFs turns SR-IOV off. */
	sriov_disable(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(pci_sriov_configure_simple);
/*
 * pci_enable_sriov - enable the SR-IOV capability
 * @dev: the PCI device (must be a PF)
 * @nr_virtfn: number of virtual functions to enable
 *
 * Returns the result of sriov_enable(), or -ENOSYS when @dev is not a PF
 * (only a PF can enable SR-IOV). May sleep.
 */
int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
{
	might_sleep();

	if (dev->is_physfn)
		return sriov_enable(dev, nr_virtfn);

	return -ENOSYS;
}
EXPORT_SYMBOL_GPL(pci_enable_sriov);
/*
 * sriov_enable - validate the request, program the SR-IOV capability and
 * create the VF devices.
 * @dev: the PF
 * @nr_virtfn: number of VFs to enable
 *
 * Returns 0 on success (also when @nr_virtfn is 0, in which case nothing is
 * done), or a negative errno:
 *   -EINVAL  VFs already enabled, or @nr_virtfn out of range
 *   -EIO     InitialVFs read from the device is inconsistent with total_VFs
 *   -ENOMEM  VF BARs not all claimed, VF bus out of range, or BAR enable failed
 *   -ENODEV / -ENOSYS  dependency-link device missing / not a PF
 *   or the error from sysfs_create_link(), pcibios_sriov_enable() or
 *   sriov_add_vfs().
 */
static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
{
int rc;
int i;
int nres;
u16 initial;
struct resource *res;
struct pci_dev *pdev;
/* SR-IOV state stored in dev->sriov (sriov_init() allocates and sets it). */
struct pci_sriov *iov = dev->sriov;
int bars = 0;
int bus;
/* Nothing to enable. */
if (!nr_virtfn)
return 0;
/* num_VFs holds the count currently in use; refuse if already enabled. */
if (iov->num_VFs)
return -EINVAL;
/*
 * Read InitialVFs and sanity-check it: it may never exceed total_VFs, and
 * when the PCI_SRIOV_CAP_VFM capability bit is clear it must equal
 * total_VFs. The requested count is then checked against the same limits:
 * non-negative, at most total_VFs, and at most InitialVFs when
 * PCI_SRIOV_CAP_VFM is clear.
 */
pci_read_config_word(dev, iov->pos + PCI_SRIOV_INITIAL_VF, &initial);
if (initial > iov->total_VFs ||
(!(iov->cap & PCI_SRIOV_CAP_VFM) && (initial != iov->total_VFs)))
return -EIO;
if (nr_virtfn < 0 || nr_virtfn > iov->total_VFs ||
(!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial)))
return -EINVAL;
/*
 * Build a bitmask of all IOV resource indices and count how many of the
 * VF BAR resources have a parent (i.e. were placed in the resource tree).
 * That count must match the number recorded in iov->nres.
 */
nres = 0;
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
/* set the bit for resource index (i + PCI_IOV_RESOURCES) */
bars |= (1 << (i + PCI_IOV_RESOURCES));
res = &dev->resource[i + PCI_IOV_RESOURCES];
if (res->parent)
nres++;
}
if (nres != iov->nres) {
pci_err(dev, "not enough MMIO resources for SR-IOV\n");
return -ENOMEM;
}
/* Bus number of the last requested VF; it must fit in the PF bus's range. */
bus = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
if (bus > dev->bus->busn_res.end) {
pci_err(dev, "can't enable %d VFs (bus %02x out of range of %pR)\n",
nr_virtfn, bus, &dev->bus->busn_res);
return -ENOMEM;
}
/* Enable the VF BAR resources selected by the bitmask. */
if (pci_enable_resources(dev, bars)) {
pci_err(dev, "SR-IOV: IOV BARS not allocated\n");
return -ENOMEM;
}
/*
 * iov->link is the function-dependency link read in sriov_init(). When it
 * names a devfn other than this PF, that device must exist on the same bus
 * and be a PF itself; it is exposed through the "dep_link" sysfs symlink.
 */
if (iov->link != dev->devfn) {
pdev = pci_get_slot(dev->bus, iov->link);
if (!pdev)
return -ENODEV;
if (!pdev->is_physfn) {
pci_dev_put(pdev);
return -ENOSYS;
}
rc = sysfs_create_link(&dev->dev.kobj,
&pdev->dev.kobj, "dep_link");
pci_dev_put(pdev);
if (rc)
return rc;
}
/* Remember the InitialVFs value read from hardware. */
iov->initial_VFs = initial;
/*
 * From here on 'initial' is the number of VFs to instantiate:
 * min(InitialVFs, nr_virtfn).
 */
if (nr_virtfn < initial)
initial = nr_virtfn;
rc = pcibios_sriov_enable(dev, initial);
if (rc) {
pci_err(dev, "failure %d from pcibios_sriov_enable()\n", rc);
goto err_pcibios;
}
/* Write NumVFs to the device (also refreshes cached offset/stride). */
pci_iov_set_numvfs(dev, nr_virtfn);
/*
 * Set the VF Enable and VF Memory Space Enable bits on top of the current
 * control value and write it out while config access is locked, then wait
 * 100 ms before touching the VFs.
 */
iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
pci_cfg_access_lock(dev);
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
msleep(100);
pci_cfg_access_unlock(dev);
/* Create and register one pci_dev per VF (see pci_iov_add_virtfn()). */
rc = sriov_add_vfs(dev, initial);
if (rc)
goto err_pcibios;
/* Notify user space that the PF changed. */
kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
/* Record the enabled VF count only after everything succeeded. */
iov->num_VFs = nr_virtfn;
return 0;
/*
 * Error unwind: clear VFE/MSE, wait one second, call the arch disable
 * hook, drop the dep_link symlink if one was created, and reset NumVFs.
 */
err_pcibios:
iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
pci_cfg_access_lock(dev);
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
ssleep(1);
pci_cfg_access_unlock(dev);
pcibios_sriov_disable(dev);
if (iov->link != dev->devfn)
sysfs_remove_link(&dev->dev.kobj, "dep_link");
pci_iov_set_numvfs(dev, 0);
return rc;
}
/*
 * Default (weak) architecture hook called by sriov_enable() before the VFs
 * are turned on. The generic version does nothing and reports success; an
 * architecture may supply its own definition.
 */
int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
{
return 0;
}
/*****************************************delete*********************************************************************/
/* Remove every currently enabled VF of @dev, in ascending index order. */
static void sriov_del_vfs(struct pci_dev *dev)
{
	struct pci_sriov *iov = dev->sriov;
	int vf = 0;

	while (vf < iov->num_VFs) {
		pci_iov_remove_virtfn(dev, vf);
		vf++;
	}
}
/***************************************release****************************************/
static void sriov_release(struct pci_dev *dev)
{
/*调试*/
BUG_ON(dev->sriov->num_VFs);
if (dev != dev->sriov->dev)
pci_dev_put(dev->sriov->dev);
/*在初始化阶段申请的iov*/
kfree(dev->sriov);
dev->sriov = NULL;
}
/*
 * pci_iov_release - release SR-IOV resources of a device
 * @dev: the PCI device; anything that is not a PF is ignored
 */
void pci_iov_release(struct pci_dev *dev)
{
	if (!dev->is_physfn)
		return;

	sriov_release(dev);
}
/********************************remove(模块卸载或者设备移除时执行)*********************************************************/
/*
 * pci_iov_remove - reset per-driver SR-IOV state of a PF
 * @dev: the PCI device; anything that is not a PF is ignored
 *
 * Restores driver_max_VFs to total_VFs and warns if VFs are still enabled.
 */
void pci_iov_remove(struct pci_dev *dev)
{
	struct pci_sriov *iov;

	if (!dev->is_physfn)
		return;

	iov = dev->sriov;
	iov->driver_max_VFs = iov->total_VFs;

	if (iov->num_VFs)
		pci_warn(dev, "driver left SR-IOV enabled after remove\n");
}
/*
 * Remove the bus a VF lived on, but only if it is a different bus than the
 * PF's and no devices are left on it.
 */
static void virtfn_remove_bus(struct pci_bus *physbus, struct pci_bus *virtbus)
{
	if (physbus == virtbus)
		return;

	if (list_empty(&virtbus->devices))
		pci_remove_bus(virtbus);
}
void pci_iov_remove_virtfn(struct pci_dev *dev, int id)
{
char buf[VIRTFN_ID_LEN];
struct pci_dev *virtfn;
virtfn = pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus),
pci_iov_virtfn_bus(dev, id),
pci_iov_virtfn_devfn(dev, id));
if (!virtfn)
return;
sprintf(buf, "virtfn%u", id);
sysfs_remove_link(&dev->dev.kobj, buf);
if (virtfn->dev.kobj.sd)
sysfs_remove_link(&virtfn->dev.kobj, "physfn");
pci_stop_and_remove_bus_device(virtfn);
virtfn_remove_bus(dev->bus, virtfn->bus);
/* balance pci_get_domain_bus_and_slot() */
pci_dev_put(virtfn);
pci_dev_put(dev);
}
/****************************sriov disable**********************************************************/
/*
 * Default (weak) architecture hook called when SR-IOV is being disabled.
 * The generic version does nothing and reports success; an architecture
 * may supply its own definition.
 */
int __weak pcibios_sriov_disable(struct pci_dev *pdev)
{
return 0;
}
/*
 * Disable SR-IOV on a PF: remove all VF devices, clear the VF Enable and
 * VF Memory Space Enable bits, wait one second, call the arch hook, remove
 * the "dep_link" symlink if one exists, and reset NumVFs to 0.
 * Does nothing when no VFs are enabled.
 */
static void sriov_disable(struct pci_dev *dev)
{
	struct pci_sriov *sriov = dev->sriov;

	if (sriov->num_VFs == 0)
		return;

	sriov_del_vfs(dev);

	sriov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
	pci_cfg_access_lock(dev);
	pci_write_config_word(dev, sriov->pos + PCI_SRIOV_CTRL, sriov->ctrl);
	ssleep(1);
	pci_cfg_access_unlock(dev);

	pcibios_sriov_disable(dev);

	if (sriov->link != dev->devfn)
		sysfs_remove_link(&dev->dev.kobj, "dep_link");

	sriov->num_VFs = 0;
	pci_iov_set_numvfs(dev, 0);
}
/*
 * pci_disable_sriov - disable the SR-IOV capability
 * @dev: the PCI device; anything that is not a PF is ignored
 *
 * May sleep.
 */
void pci_disable_sriov(struct pci_dev *dev)
{
	might_sleep();

	if (dev->is_physfn)
		sriov_disable(dev);
}
EXPORT_SYMBOL_GPL(pci_disable_sriov);
/****************************************init*************************************************************/
/*
 * sriov_init - discover and record a device's SR-IOV capability.
 * @dev: candidate PF
 * @pos: config-space offset of the SR-IOV extended capability
 *
 * Summary: initialization sizes the VF BAR space, determines how many VFs
 * the PF supports and caches the capability fields in a freshly allocated
 * struct pci_sriov. It does not create any VF; VFs are only brought up
 * later through pci_enable_sriov() (or the sysfs sriov_numvfs path).
 *
 * Returns 0 on success or when TotalVFs is 0 (device left untouched as a
 * non-PF), -EIO on an unusable page size / misaligned BAR / bad
 * offset-stride, -ENOMEM on allocation failure.
 */
static int sriov_init(struct pci_dev *dev, int pos)
{
int i, bar64;
int rc;
int nres;
u32 pgsz;
u16 ctrl, total;
struct pci_sriov *iov;
struct resource *res;
struct pci_dev *pdev;
/*
 * VFs must not be enabled during initialization. If the VF Enable bit is
 * found set, clear the whole control register and wait one second.
 */
pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
/* Test the VF Enable bit of the 16-bit control word. */
if (ctrl & PCI_SRIOV_CTRL_VFE) {
/* Normally the control register is expected to be 0 at this point. */
pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
ssleep(1);
}
/*
 * Look for a PF already present on this bus. If one is found, jump to
 * 'found' with pdev pointing at it and ctrl left at 0. Otherwise pdev is
 * reset to NULL and the ARI bit is set in ctrl when the bus has ARI
 * enabled.
 */
ctrl = 0;
list_for_each_entry(pdev, &dev->bus->devices, bus_list)
if (pdev->is_physfn)
goto found;
pdev = NULL;
if (pci_ari_enabled(dev->bus))
ctrl |= PCI_SRIOV_CTRL_ARI;
found:
pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
/* Read TotalVFs: how many VFs this PF can support. None -> nothing to do. */
pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total);
if (!total)
return 0;
/*
 * Supported Page Sizes: a bitmap of the page sizes the PF supports. Bit n
 * set means 2^(n + 12) bytes is supported, 0 <= n <= 31.
 * Example: 0x00000553 means 4KB, 8KB, 64KB, 256KB, 1MB and 4MB.
 * Bits for sizes smaller than the kernel's PAGE_SIZE are masked off; if
 * nothing remains the device cannot be used.
 */
pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
pgsz &= ~((1 << i) - 1);
if (!pgsz)
return -EIO;
/*
 * System Page Size: written by software with exactly one bit set, chosen
 * from the supported set (a value of 1 means 4KB). Per the author's notes,
 * all VF BARs of this PF are aligned to this size. The expression below
 * keeps only the lowest set bit, i.e. the smallest remaining page size.
 */
pgsz &= ~(pgsz - 1);
pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);
/* Allocate the zero-initialized SR-IOV state. */
iov = kzalloc(sizeof(*iov), GFP_KERNEL);
if (!iov)
return -ENOMEM;
/* Size each VF BAR and grow the PF's IOV resource to cover all VFs. */
nres = 0;
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
/* res is the PF resource slot reserved for VF BAR i. */
res = &dev->resource[i + PCI_IOV_RESOURCES];
/* A resource already marked FIXED is not re-read from hardware. */
if (res->flags & IORESOURCE_PCI_FIXED)
bar64 = (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
else
/*
 * bar64 is non-zero for a 64-bit BAR, which occupies two BAR slots;
 * see "i += bar64" below, which skips the upper half.
 */
bar64 = __pci_read_base(dev, pci_bar_unknown, res, pos + PCI_SRIOV_BAR + i * 4);
/* No flags: this VF BAR is not implemented/usable, skip it. */
if (!res->flags)
continue;
/* A single VF's BAR size must be a multiple of PAGE_SIZE. */
if (resource_size(res) & (PAGE_SIZE - 1)) {
rc = -EIO;
goto failed;
}
/* Remember the size of one VF's BAR i. */
iov->barsz[i] = resource_size(res);
/*
 * Extend the resource to hold BAR i of all 'total' VFs back to back;
 * VF n's BAR i is later carved out at offset n * barsz[i].
 */
res->end = res->start + resource_size(res) * total - 1;
pci_info(dev, "VF(n) BAR%d space: %pR (contains BAR%d for %d VFs)\n", i, res, i, total);
i += bar64;
/* Count of VF BAR resources in use. */
nres++;
}
/* Fill in the remaining fields of the SR-IOV state. */
iov->pos = pos;
iov->nres = nres;
iov->ctrl = ctrl;
iov->total_VFs = total;
iov->driver_max_VFs = total;
pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
iov->pgsz = pgsz;
iov->self = dev;
iov->drivers_autoprobe = true;
/* SR-IOV Capabilities register */
pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap);
/* Function Dependency Link */
pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link);
if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END)
iov->link = PCI_DEVFN(PCI_SLOT(dev->devfn), iov->link);
/*
 * If the bus scan above found an earlier PF, iov->dev holds a reference
 * to it; otherwise iov->dev is this device itself.
 */
if (pdev)
iov->dev = pci_dev_get(pdev);
else
iov->dev = dev;
/* Attach the state to the device: dev is the PF. */
dev->sriov = iov;
/* Mark the device as a physical function. */
dev->is_physfn = 1;
/* Fill in iov->max_VF_buses. */
rc = compute_max_vf_buses(dev);
if (rc)
goto fail_max_buses;
return 0;
/* Undo the PF marking, then fall through to the common cleanup. */
fail_max_buses:
dev->sriov = NULL;
dev->is_physfn = 0;
/* Invalidate all IOV resources and free the state. */
failed:
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
res = &dev->resource[i + PCI_IOV_RESOURCES];
res->flags = 0;
}
kfree(iov);
return rc;
}
/*
 * pci_iov_init - initialize the SR-IOV capability of a device, if present
 * @dev: the PCI device
 *
 * Returns -ENODEV when @dev is not a PCIe device or has no SR-IOV extended
 * capability; otherwise the result of sriov_init().
 */
int pci_iov_init(struct pci_dev *dev)
{
	int pos;

	/* Only PCIe devices are considered. */
	if (!pci_is_pcie(dev))
		return -ENODEV;

	/* pos is the config-space offset of the SR-IOV capability, 0 if absent */
	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV);
	if (!pos)
		return -ENODEV;

	return sriov_init(dev, pos);
}
/******************************************************************************************/
/*
 * pci_iov_update_resource - write a VF BAR resource of a PF back to the
 * SR-IOV capability
 * @dev: the PCI device; anything that is not a PF is ignored
 * @resno: index into dev->resource[] of the IOV resource to write
 *
 * Does nothing if the VFs are currently enabled with memory space on (a
 * warning is emitted in that case), or if the resource has no flags, is
 * unset, or is fixed.
 */
void pci_iov_update_resource(struct pci_dev *dev, int resno)
{
	struct pci_sriov *iov = dev->is_physfn ? dev->sriov : NULL;
	struct resource *res = dev->resource + resno;
	int vf_bar = resno - PCI_IOV_RESOURCES;
	struct pci_bus_region region;
	u16 cmd;
	u32 new;
	int reg;

	if (!iov)
		return;

	/* The VF BARs must not be rewritten while VFs are enabled and decoding. */
	pci_read_config_word(dev, iov->pos + PCI_SRIOV_CTRL, &cmd);
	if ((cmd & PCI_SRIOV_CTRL_VFE) && (cmd & PCI_SRIOV_CTRL_MSE)) {
		dev_WARN(&dev->dev, "can't update enabled VF BAR%d %pR\n",
			 vf_bar, res);
		return;
	}

	/* Skip resources with no flags set. */
	if (!res->flags)
		return;

	/* Skip resources that have not been assigned an address. */
	if (res->flags & IORESOURCE_UNSET)
		return;

	/* Skip resources that must stay where they are. */
	if (res->flags & IORESOURCE_PCI_FIXED)
		return;

	/* Translate the CPU resource into a bus address range. */
	pcibios_resource_to_bus(dev->bus, &region, res);
	new = region.start;
	new |= res->flags & ~PCI_BASE_ADDRESS_MEM_MASK;

	reg = iov->pos + PCI_SRIOV_BAR + 4 * vf_bar;
	pci_write_config_dword(dev, reg, new);
	if (res->flags & IORESOURCE_MEM_64) {
		/* Upper 32 bits; shifted in two steps of 16. */
		new = region.start >> 16 >> 16;
		pci_write_config_dword(dev, reg + 4, new);
	}
}
/****************************************************************************/
/*
 * Re-program the SR-IOV capability of a PF from the cached pci_sriov state.
 * Nothing is done if the hardware already reports VF Enable set.
 */
static void sriov_restore_state(struct pci_dev *dev)
{
int i;
u16 ctrl;
struct pci_sriov *iov = dev->sriov;
pci_read_config_word(dev, iov->pos + PCI_SRIOV_CTRL, &ctrl);
/* VFs are already enabled in hardware: leave the device alone. */
if (ctrl & PCI_SRIOV_CTRL_VFE)
return;
/*
 * First restore only the ARI bit from the cached control value, leaving
 * the other bits as read; the full control word is written at the end,
 * after BARs, page size and NumVFs have been restored.
 */
ctrl &= ~PCI_SRIOV_CTRL_ARI;
ctrl |= iov->ctrl & PCI_SRIOV_CTRL_ARI;
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, ctrl);
/* Rewrite every VF BAR from the PF's IOV resources. */
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
pci_update_resource(dev, i + PCI_IOV_RESOURCES);
pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
pci_iov_set_numvfs(dev, iov->num_VFs);
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
/* Same 100 ms wait as sriov_enable() after turning VF Enable on. */
if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
msleep(100);
}
/*
 * pci_restore_iov_state - restore the SR-IOV capability state
 * @dev: the PCI device; anything that is not a PF is ignored
 */
void pci_restore_iov_state(struct pci_dev *dev)
{
	if (!dev->is_physfn)
		return;

	sriov_restore_state(dev);
}
/**********************************************************************************/
/*
 * pci_vf_drivers_autoprobe - set the PF's drivers_autoprobe flag
 * @dev: the PCI device; anything that is not a PF is ignored
 * @auto_probe: new value of the flag
 */
void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool auto_probe)
{
	if (!dev->is_physfn)
		return;

	dev->sriov->drivers_autoprobe = auto_probe;
}
/*
 * sysfs write handler for sriov_drivers_autoprobe: parses a boolean from
 * @buf and stores it in the PF's drivers_autoprobe flag.
 * Returns @count on success, -EINVAL if @buf is not a valid boolean.
 */
static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
					     struct device_attribute *attr,
					     const char *buf, size_t count)
{
	struct pci_dev *pf = to_pci_dev(dev);
	bool enable;

	if (kstrtobool(buf, &enable) < 0)
		return -EINVAL;

	pf->sriov->drivers_autoprobe = enable;
	return count;
}
/**************************************************************************/
/*
 * pci_iov_bus_range - number of extra bus numbers VFs on @bus may need
 * @bus: the PCI bus
 *
 * Scans the PFs on @bus for the largest max_VF_buses value. Returns that
 * value minus the bus's own number, or 0 if no PF reported a non-zero
 * value.
 */
int pci_iov_bus_range(struct pci_bus *bus)
{
	struct pci_dev *dev;
	int highest = 0;

	list_for_each_entry(dev, &bus->devices, bus_list) {
		if (dev->is_physfn && dev->sriov->max_VF_buses > highest)
			highest = dev->sriov->max_VF_buses;
	}

	if (!highest)
		return 0;

	return highest - bus->number;
}
/***************************************************************************************/
/*
 * pci_sriov_set_totalvfs - let the PF driver lower the advertised VF limit
 * @dev: the PF
 * @numvfs: new limit; must not exceed the hardware TotalVFs
 *
 * The limit is stored in driver_max_VFs, not total_VFs. This is why
 * pci_sriov_get_totalvfs() returns driver_max_VFs: it reports the
 * driver-adjusted limit rather than the raw hardware value.
 *
 * Returns 0 on success, -ENOSYS if @dev is not a PF, -EINVAL if @numvfs is
 * larger than total_VFs, -EBUSY if VFs are already enabled.
 */
int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs)
{
	struct pci_sriov *iov;

	if (!dev->is_physfn)
		return -ENOSYS;

	iov = dev->sriov;
	if (numvfs > iov->total_VFs)
		return -EINVAL;

	/* Shouldn't change if VFs already enabled */
	if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
		return -EBUSY;

	iov->driver_max_VFs = numvfs;
	return 0;
}
EXPORT_SYMBOL_GPL(pci_sriov_set_totalvfs);
/*
 * pci_sriov_get_totalvfs - VF limit currently advertised for a PF
 * @dev: the PCI device
 *
 * Returns driver_max_VFs for a PF, 0 for any other device.
 */
int pci_sriov_get_totalvfs(struct pci_dev *dev)
{
	return dev->is_physfn ? dev->sriov->driver_max_VFs : 0;
}
EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
/*
 * pci_num_vf - number of VFs currently enabled on a PF
 * @dev: the PCI device
 *
 * Returns num_VFs for a PF, 0 for any other device.
 */
int pci_num_vf(struct pci_dev *dev)
{
	return dev->is_physfn ? dev->sriov->num_VFs : 0;
}
EXPORT_SYMBOL_GPL(pci_num_vf);
/**************************************************************************************/
/*
 * pci_vfs_assigned - count the VFs of a PF that are assigned
 * @dev: the PCI device
 *
 * Iterates over all devices matching the PF's vendor ID and VF device ID,
 * and counts those that are VFs of @dev and for which
 * pci_is_dev_assigned() is true. Returns 0 if @dev is not a PF.
 */
int pci_vfs_assigned(struct pci_dev *dev)
{
	struct pci_dev *vfdev;
	unsigned int vfs_assigned = 0;
	unsigned short dev_id;

	/* only search if we are a PF */
	if (!dev->is_physfn)
		return 0;

	dev_id = dev->sriov->vf_device;

	/* visit every candidate VF and keep those that belong to this PF */
	for (vfdev = pci_get_device(dev->vendor, dev_id, NULL);
	     vfdev;
	     vfdev = pci_get_device(dev->vendor, dev_id, vfdev)) {
		if (vfdev->is_virtfn && (vfdev->physfn == dev) &&
		    pci_is_dev_assigned(vfdev))
			vfs_assigned++;
	}

	return vfs_assigned;
}
EXPORT_SYMBOL_GPL(pci_vfs_assigned);
/*
 * Create VFs 0 .. num_vfs-1 one at a time. If adding one fails, the VFs
 * added so far are removed again in reverse order and the error is
 * returned. Nothing is created when the PF has no_vf_scan set.
 */
static int sriov_add_vfs(struct pci_dev *dev, u16 num_vfs)
{
	unsigned int i;
	int rc;

	if (dev->no_vf_scan)
		return 0;

	for (i = 0; i < num_vfs; i++) {
		rc = pci_iov_add_virtfn(dev, i);
		if (rc) {
			/* roll back VFs i-1 .. 0 */
			while (i--)
				pci_iov_remove_virtfn(dev, i);
			return rc;
		}
	}

	return 0;
}
/*
 * Return the pci_bus with number @busnr for a VF: the PF's own bus if the
 * numbers match, an existing bus in the same domain if there is one, or a
 * newly added child bus otherwise. Returns NULL if a new bus could not be
 * added.
 */
static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
{
	struct pci_bus *child;

	if (bus->number == busnr)
		return bus;

	child = pci_find_bus(pci_domain_nr(bus), busnr);
	if (!child) {
		child = pci_add_new_bus(bus, NULL, busnr);
		if (child)
			pci_bus_insert_busn_res(child, busnr, busnr);
	}

	return child;
}
static void pci_read_vf_config_common(struct pci_dev *virtfn)
{
struct pci_dev *physfn = virtfn->physfn;
pci_read_config_dword(virtfn, PCI_CLASS_REVISION,
&physfn->sriov->class);
pci_read_config_byte(virtfn, PCI_HEADER_TYPE,
&physfn->sriov->hdr_type);
pci_read_config_word(virtfn, PCI_SUBSYSTEM_VENDOR_ID,
&physfn->sriov->subsystem_vendor);
pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID,
&physfn->sriov->subsystem_device);
}
/*
 * pci_iov_sysfs_link - create the PF <-> VF sysfs symlinks
 * @dev: the PF
 * @virtfn: the VF
 * @id: zero-based VF index
 *
 * Creates "virtfn<id>" under the PF and "physfn" under the VF, then emits
 * a change uevent for the VF. If the second link fails, the first one is
 * removed again. Returns 0 or the error from sysfs_create_link().
 */
int pci_iov_sysfs_link(struct pci_dev *dev,
		       struct pci_dev *virtfn, int id)
{
	char name[VIRTFN_ID_LEN];
	int rc;

	sprintf(name, "virtfn%u", id);

	rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, name);
	if (rc)
		return rc;

	rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
	if (rc) {
		sysfs_remove_link(&dev->dev.kobj, name);
		return rc;
	}

	kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);
	return 0;
}
/*
 * pci_iov_add_virtfn - create and register one VF of a PF
 * @dev: the PF
 * @id: zero-based VF index
 *
 * Finds or creates the bus the VF lives on, allocates a pci_dev for it,
 * fills in its identity from the PF's SR-IOV state, carves its BARs out of
 * the PF's IOV resources, and adds it to the bus together with the
 * "virtfn<id>"/"physfn" sysfs links.
 *
 * Returns 0 on success, -ENOMEM if the bus or device could not be
 * allocated, or the error from pci_setup_device() / pci_iov_sysfs_link().
 */
int pci_iov_add_virtfn(struct pci_dev *dev, int id)
{
	int i;
	int rc = -ENOMEM;
	u64 size;
	struct pci_dev *virtfn;
	struct resource *res;
	struct pci_sriov *iov = dev->sriov;
	struct pci_bus *bus;

	/* Bus the VF sits on (may be the PF's bus or a new "virtual" bus). */
	bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
	if (!bus)
		goto failed;

	/* Allocate a pci_dev on that bus. */
	virtfn = pci_alloc_dev(bus);
	if (!virtfn)
		goto failed0;

	/* Identity: devfn from the Routing ID math, IDs from the PF. */
	virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
	virtfn->vendor = dev->vendor;
	virtfn->device = iov->vf_device;
	/* Mark as a virtual function and tie it to its PF (takes a PF ref). */
	virtfn->is_virtfn = 1;
	virtfn->physfn = pci_dev_get(dev);
	virtfn->no_command_memory = 1;

	/*
	 * The values read here are cached in the PF's SR-IOV state, so they
	 * are only read once, from VF 0.
	 */
	if (id == 0)
		pci_read_vf_config_common(virtfn);

	rc = pci_setup_device(virtfn);
	if (rc) {
		/*
		 * virtfn has not been handed to pci_device_add() yet, so the
		 * stop-and-remove teardown under failed1 is not the matching
		 * cleanup and would leave the allocation behind. Release it
		 * by hand instead: drop the PF reference taken above, drop
		 * the bus reference held by the new device (presumably taken
		 * by pci_alloc_dev(); this mirrors the upstream fix), and
		 * free it.
		 */
		pci_dev_put(dev);
		pci_bus_put(virtfn->bus);
		kfree(virtfn);
		goto failed0;
	}

	/* The VF shares the PF's parent device and is single-function. */
	virtfn->dev.parent = dev->dev.parent;
	virtfn->multifunction = 0;

	/* Carve VF @id's slice out of each claimed PF IOV resource. */
	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
		res = &dev->resource[i + PCI_IOV_RESOURCES];
		if (!res->parent)
			continue;
		virtfn->resource[i].name = pci_name(virtfn);
		virtfn->resource[i].flags = res->flags;
		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
		virtfn->resource[i].start = res->start + size * id;
		virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
		rc = request_resource(res, &virtfn->resource[i]);
		BUG_ON(rc);
	}

	/* Register the device on its bus, then create the sysfs links. */
	pci_device_add(virtfn, virtfn->bus);
	rc = pci_iov_sysfs_link(dev, virtfn, id);
	if (rc)
		goto failed1;

	/* Finally make the device visible to drivers. */
	pci_bus_add_device(virtfn);

	return 0;

failed1:
	pci_stop_and_remove_bus_device(virtfn);
	pci_dev_put(dev);
failed0:
	virtfn_remove_bus(dev->bus, bus);
failed:
	return rc;
}
总结:这部分代码量比较庞大,涉及的东西比较多。通过阅读,要真正的掌握这部分代码,我的感受有以下几点。
1:由于在enable sriov时需要指定num_vfs,而添加一个vf设备所涉及的过程和pci设备枚举是一样的;由于自己对这部分不是非常熟悉,所以有些部分的代码还无法吃透。后续我可能会先去阅读pcie枚举部分的代码,再回来看sriov。
2:在阅读时,大部分操作涉及寄存器的赋值,这里推荐一下这个网址:
https://www.intel.cn/content/www/cn/zh/docs/programmable/683111/17-1/initial-vfs-and-total-vfs-registers.html
3:有些细节问题需要多加打印语句才能理解,在阅读这部分代码时我没有调试,这是我后续需要改进的地方。
更多推荐
所有评论(0)