Skip to content

Commit 4200f2c

Browse files
jiayu.chenjoy717
jiayu.chen
authored and committed
fix pod bind wrong card when there are other gpus
Signed-off-by: joy717 <joy717.xmu@gmail.com>
1 parent 729d824 commit 4200f2c

File tree

4 files changed

+117
-5
lines changed

4 files changed

+117
-5
lines changed

internal/pkg/dcu/server.go

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ func (p *Plugin) createvdevFiles(current *corev1.Pod, ctr *corev1.Container, req
378378
}
379379
dirName := string(current.UID) + "_" + ctr.Name + "_" + fmt.Sprint(devidx) + "_" + fmt.Sprint(pipeid) + "_" + fmt.Sprint(vdevidx) + "_" + fmt.Sprint(coremsk1) + "_" + fmt.Sprint(coremsk2)
380380
cacheFileHostDirectory := fmt.Sprintf("/usr/local/vgpu/dcu/%s", dirName)
381-
err = createvdevFile(pcibusId, coremsk1, coremsk2, reqcores, mem, 0, vdevidx, pipeid, cacheFileHostDirectory, "vdev0.conf")
381+
err = createvdevFile(pcibusId, coremsk1, coremsk2, reqcores, mem, devidx, vdevidx, pipeid, cacheFileHostDirectory, fmt.Sprintf("vdev%d.conf", vdevidx))
382382
if err != nil {
383383
return "", err
384384
}
@@ -445,6 +445,11 @@ func (p *Plugin) Allocate(ctx context.Context, reqs *kubeletdevicepluginv1beta1.
445445
nodelock.ReleaseNodeLock(nodename, NodeLockDCU)
446446
return &kubeletdevicepluginv1beta1.AllocateResponse{}, err
447447
}
448+
drmCards, drmRenders, err := util.ListDcuDrmDevices()
449+
if err != nil {
450+
util.PodAllocationFailed(nodename, current, NodeLockDCU)
451+
return &kubeletdevicepluginv1beta1.AllocateResponse{}, err
452+
}
448453
for idx := range reqs.ContainerRequests {
449454
currentCtr, devreq, err := util.GetNextDeviceRequest(util.HygonDCUDevice, *current)
450455
klog.Infoln("deviceAllocateFromAnnotation=", devreq)
@@ -479,18 +484,33 @@ func (p *Plugin) Allocate(ctx context.Context, reqs *kubeletdevicepluginv1beta1.
479484
car.Devices = append(car.Devices, dev)
480485

481486
for _, val := range devreq {
482-
var id int
487+
var devIdx = -1
483488
klog.Infof("Allocating device ID: %s", val.UUID)
484-
fmt.Sscanf(val.UUID, "DCU-%d", &id)
489+
succeedCount, err := fmt.Sscanf(val.UUID, "DCU-%d", &devIdx)
490+
if err != nil || succeedCount == 0 || devIdx == -1 {
491+
klog.Errorf("Invalid request device uuid: %s", val.UUID)
492+
util.PodAllocationFailed(nodename, current, NodeLockDCU)
493+
return &kubeletdevicepluginv1beta1.AllocateResponse{}, fmt.Errorf("invalid request device uuid %s", val.UUID)
494+
}
495+
496+
if devIdx > len(drmCards) || devIdx > len(drmRenders) {
497+
klog.Errorf("Invalid device index: %d, all devices counts is: %d, all renders count is: %d", devIdx, len(drmCards), len(drmRenders))
498+
util.PodAllocationFailed(nodename, current, NodeLockDCU)
499+
return &kubeletdevicepluginv1beta1.AllocateResponse{}, fmt.Errorf("can not match dcu dri request %s. cards %d, renders %d", val.UUID, len(drmCards), len(drmRenders))
500+
}
485501

486-
devpath := fmt.Sprintf("/dev/dri/card%d", id)
502+
drmCardName := drmCards[devIdx]
503+
klog.Infof("All dcu dri card devs: %v, mapped dri: %s", drmCards, drmCardName)
504+
devpath := fmt.Sprintf("/dev/dri/%s", drmCardName)
487505
dev = new(kubeletdevicepluginv1beta1.DeviceSpec)
488506
dev.HostPath = devpath
489507
dev.ContainerPath = devpath
490508
dev.Permissions = "rw"
491509
car.Devices = append(car.Devices, dev)
492510

493-
devpath = fmt.Sprintf("/dev/dri/renderD%d", (id + 128))
511+
drmRenderName := drmRenders[devIdx]
512+
klog.Infof("All dcu dri render devs: %v, mapped dri: %s", drmRenders, drmRenderName)
513+
devpath = fmt.Sprintf("/dev/dri/%s", drmRenderName)
494514
dev = new(kubeletdevicepluginv1beta1.DeviceSpec)
495515
dev.HostPath = devpath
496516
dev.ContainerPath = devpath

internal/pkg/util/drm_slice.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package util
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
)
7+
8+
// DrmSlice is a struct for sort to natural sorting, we assume that the element is a 'card%d'/'renderD%d' format.
9+
// Note: if you want a common natural sorting, you should NOT use this one.
10+
type DrmSlice []string
11+
12+
func (p DrmSlice) Len() int { return len(p) }
13+
func (p DrmSlice) Less(i, j int) bool { return DrmSortLess(p[i], p[j]) }
14+
func (p DrmSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
15+
16+
func DrmSortLess(a, b string) bool {
17+
cardA := strings.HasPrefix(a, DriPrefixCard)
18+
cardB := strings.HasPrefix(b, DriPrefixCard)
19+
if cardA && cardB {
20+
var aid, bid int
21+
fmt.Sscanf(a, "card%d", &aid)
22+
fmt.Sscanf(b, "card%d", &bid)
23+
return aid <= bid
24+
}
25+
renderA := strings.HasPrefix(a, DriPrefixRender)
26+
renderB := strings.HasPrefix(b, DriPrefixRender)
27+
if renderA && renderB {
28+
var aid, bid int
29+
fmt.Sscanf(a, "renderD%d", &aid)
30+
fmt.Sscanf(b, "renderD%d", &bid)
31+
return aid <= bid
32+
}
33+
return a <= b
34+
}

internal/pkg/util/types.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,17 @@ const (
4242

4343
// NodeNameEnvName define env var name for use get node name.
4444
NodeNameEnvName = "NODE_NAME"
45+
46+
//HygonVendorID is the pci vendor id for hygon
47+
HygonVendorID = "0x1d94"
48+
49+
// LocationDri is the location of the dri
50+
LocationDri = "/dev/dri"
51+
52+
//DriPrefixCard is the "card" file name prefix in /dev/dri. eg: /dev/dri/card0
53+
DriPrefixCard = "card"
54+
//DriPrefixRender is the "render" file name prefix in /dev/dri. eg: /dev/dri/renderD128
55+
DriPrefixRender = "renderD"
4556
)
4657

4758
type DevicePluginConfigs struct {

internal/pkg/util/util.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ import (
2222
"errors"
2323
"flag"
2424
"fmt"
25+
"os"
26+
"sort"
2527
"strconv"
2628
"strings"
2729
"time"
@@ -422,3 +424,48 @@ func PodAllocationFailed(nodeName string, pod *corev1.Pod, lockname string) {
422424
klog.Errorf("release lock failed:%v", err.Error())
423425
}
424426
}
427+
428+
// ListDcuDrmDevices list all drm devices, and filter it by dcu vendorID
429+
func ListDcuDrmDevices() ([]string, []string, error) {
430+
filenames := make([]string, 0)
431+
dcuDrms := make([]string, 0)
432+
dcuRenders := make([]string, 0)
433+
files, err := os.ReadDir(LocationDri)
434+
if err != nil {
435+
klog.Errorf("ListDcuDrmDevices list dri directory error: %v", err.Error())
436+
return nil, nil, err
437+
}
438+
for _, f := range files {
439+
if f.IsDir() {
440+
continue
441+
}
442+
if strings.HasPrefix(f.Name(), DriPrefixCard) || strings.HasPrefix(f.Name(), DriPrefixRender) {
443+
filenames = append(filenames, f.Name())
444+
}
445+
}
446+
// we need the devices order correctly, sort it first
447+
sort.Sort(DrmSlice(filenames))
448+
for _, f := range filenames {
449+
vendorID, err := os.ReadFile(fmt.Sprintf("/sys/class/drm/%s/device/vendor", f))
450+
if err != nil {
451+
klog.Errorf("ListDcuDrmDevices read vendor file error: %v", err.Error())
452+
return nil, nil, err
453+
}
454+
fixedVendorID := strings.TrimSpace(string(vendorID))
455+
if fixedVendorID != HygonVendorID {
456+
klog.Infof("ListDcuDrmDevices dri dev %s vendorID %s is not dcu, skip it", f, fixedVendorID)
457+
continue
458+
}
459+
if strings.HasPrefix(f, DriPrefixCard) {
460+
dcuDrms = append(dcuDrms, f)
461+
}
462+
if strings.HasPrefix(f, DriPrefixRender) {
463+
dcuRenders = append(dcuRenders, f)
464+
}
465+
}
466+
if len(dcuDrms) != len(dcuRenders) {
467+
return nil, nil, fmt.Errorf("ListDcuDrmDevices dcuDrms %v and dcuRenders %v length not equal", dcuDrms, dcuRenders)
468+
}
469+
470+
return dcuDrms, dcuRenders, nil
471+
}

0 commit comments

Comments
 (0)