Skip to content

Commit 1a98838

Browse files
committed
Manually create udev links if udev trigger doesn't work
Change-Id: I26bffb3eda447c8343ce39e69c34cf31616120e3
1 parent 4c5bb0b commit 1a98838

File tree

5 files changed

+162
-18
lines changed

5 files changed

+162
-18
lines changed

pkg/deviceutils/device-utils.go

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,10 @@ func (m *deviceUtils) VerifyDevicePath(devicePaths []string, deviceName string)
290290
})
291291

292292
if err != nil {
293-
return "", fmt.Errorf("failed to find and re-link disk %s with udevadm after retrying for %v: %w", deviceName, pollTimeout, err)
293+
klog.Warningf("For device %s udevadmin failed: %v. Trying to manually link", deviceName, err)
294+
if err := manuallySetDevicePath(deviceName); err != nil {
295+
return "", fmt.Errorf("failed to manually set link for disk %s: %w", deviceName, err)
296+
}
294297
}
295298

296299
return devicePath, nil
@@ -338,11 +341,11 @@ func findAvailableDevFsPaths() ([]string, error) {
338341
return append(diskSDPaths, diskNvmePaths...), nil
339342
}
340343

341-
func udevadmTriggerForDiskIfExists(deviceName string) error {
344+
func findDevice(deviceName string) (string, string, error) {
342345
devFsPathToSerial := map[string]string{}
343346
devFsPaths, err := findAvailableDevFsPaths()
344347
if err != nil {
345-
return err
348+
return "", "", err
346349
}
347350
for _, devFsPath := range devFsPaths {
348351
devFsSerial, err := getDevFsSerial(devFsPath)
@@ -355,17 +358,33 @@ func udevadmTriggerForDiskIfExists(deviceName string) error {
355358
klog.V(4).Infof("device path %s, serial number %v", devFsPath, devFsSerial)
356359
devFsPathToSerial[devFsPath] = devFsSerial
357360
if devFsSerial == deviceName {
358-
// Found the disk that we're looking for so run a trigger on it
359-
// to resolve its /dev/by-id/ path
360-
klog.Warningf("udevadm --trigger running to fix disk at path %s which has serial number %s", devFsPath, devFsSerial)
361-
err := udevadmChangeToDrive(devFsPath)
362-
if err != nil {
363-
return fmt.Errorf("udevadm --trigger failed to fix device path %s which has serial number %s: %w", devFsPath, devFsSerial, err)
364-
}
365-
return nil
361+
return devFsPath, devFsSerial, nil
366362
}
367363
}
368-
return fmt.Errorf("udevadm --trigger requested to fix disk %s but no such disk was found in device path %v", deviceName, devFsPathToSerial)
364+
return "", "", fmt.Errorf("udevadm --trigger requested to fix disk %s but no such disk was found in device path %v", deviceName, devFsPathToSerial)
365+
}
366+
367+
func manuallySetDevicePath(deviceName string) error {
368+
devFsPath, devFsSerial, err := findDevice(deviceName)
369+
if err != nil {
370+
return err
371+
}
372+
return os.Symlink(devFsPath, path.Join(diskByIdPath, diskGooglePrefix+devFsSerial))
373+
}
374+
375+
func udevadmTriggerForDiskIfExists(deviceName string) error {
376+
devFsPath, devFsSerial, err := findDevice(deviceName)
377+
if err != nil {
378+
return err
379+
}
380+
// Found the disk that we're looking for so run a trigger on it
381+
// to resolve its /dev/by-id/ path
382+
klog.Warningf("udevadm --trigger running to fix disk at path %s which has serial number %s", devFsPath, devFsSerial)
383+
err = udevadmChangeToDrive(devFsPath)
384+
if err != nil {
385+
return fmt.Errorf("udevadm --trigger failed to fix device path %s which has serial number %s: %w", devFsPath, devFsSerial, err)
386+
}
387+
return nil
369388
}
370389

371390
// Calls "udevadm trigger --action=change" on the specified drive. drivePath

test/e2e/tests/setup_e2e_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ var (
4343
serviceAccount = flag.String("service-account", "", "Service account to bring up instance with")
4444
vmNamePrefix = flag.String("vm-name-prefix", "gce-pd-csi-e2e", "VM name prefix")
4545
architecture = flag.String("arch", "amd64", "Architecture pd csi driver build on")
46-
minCpuPlatform = flag.String("min-cpu-platform", "rome", "Minimum CPU architecture")
47-
mwMinCpuPlatform = flag.String("min-cpu-platform-mw", "sapphirerapids", "Minimum CPU architecture for multiwriter tests")
46+
minCpuPlatform = flag.String("min-cpu-platform", "AMD Rome", "Minimum CPU architecture")
47+
mwMinCpuPlatform = flag.String("min-cpu-platform-mw", "Intel Sapphire Rapids", "Minimum CPU architecture for multiwriter tests")
4848
zones = flag.String("zones", "us-east4-a,us-east4-c", "Zones to run tests in. If there are multiple zones, separate each by comma")
4949
machineType = flag.String("machine-type", "n2d-standard-4", "Type of machine to provision instance on")
5050
imageURL = flag.String("image-url", "projects/ubuntu-os-cloud/global/images/family/ubuntu-minimal-2404-lts-amd64", "OS image url to get image from")
@@ -120,8 +120,8 @@ var _ = BeforeSuite(func() {
120120
tcc <- NewDefaultTestContext(curZone, strconv.Itoa(randInt))
121121
}(zone, j)
122122
}
123-
wg.Add(1)
124123
go func(curZone string) {
124+
wg.Add(1)
125125
defer GinkgoRecover()
126126
defer wg.Done()
127127
hdtcc <- NewTestContext(curZone, *hdMinCpuPlatform, *hdMachineType, "0")

test/e2e/tests/single_zone_e2e_test.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1665,6 +1665,86 @@ var _ = Describe("GCE PD CSI Driver", func() {
16651665
Expect(err).To(BeNil(), "Failed to rm file path %s: %v", fp, err)
16661666
})
16671667

1668+
// Exercises the driver's fallback path for nodes where udev is not running:
// volumes must still attach and mount while udev is stopped, and must keep
// working once udev is started again.
It("Should mount if udev disabled, and remount if it's enabled again", func() {
	testContext := getRandomTestContext()
	p, z, _ := testContext.Instance.GetIdentity()
	client := testContext.Client
	instance := testContext.Instance

	// Always try to restore udev when the spec exits, including when an
	// assertion below fails. The test context comes from
	// getRandomTestContext, so presumably the instance is reused by other
	// specs and must not be left with udev stopped. Starting an already
	// running unit is harmless, so this is safe on the success path too.
	defer func() {
		if enableErr := instance.EnableUdev(); enableErr != nil {
			klog.Errorf("failed to re-enable udev during cleanup: %v", enableErr)
		}
	}()

	err := instance.DisableUdev()
	Expect(err).To(BeNil(), "Failed to disable udev")

	// Create Disk
	volName, volID := createAndValidateUniqueZonalDisk(client, p, z, standardDiskType)
	vol2Name, vol2ID := createAndValidateUniqueZonalDisk(client, p, z, standardDiskType)

	defer func() {
		// Delete Disks
		err := client.DeleteVolume(volID)
		Expect(err).To(BeNil(), "DeleteVolume failed")

		err = client.DeleteVolume(vol2ID)
		Expect(err).To(BeNil(), "DeleteVolume failed")

		// Validate Disks Deleted
		_, err = computeService.Disks.Get(p, z, volName).Do()
		Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found")
		_, err = computeService.Disks.Get(p, z, vol2Name).Do()
		Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found")
	}()

	// Attach & detach disk. We retry as we expect the udev repair to take a little bit of time.
	err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
		err = testAttachWriteReadDetach(volID, volName, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
		if err != nil {
			klog.Infof("initial udev error, retrying: %v", err)
		}
		return err == nil, nil
	})
	Expect(err).To(BeNil(), "Failed to go through volume lifecycle")

	// Attach a different disk. The conflicting udev paths should not cause a problem.
	err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
		err = testAttachWriteReadDetach(vol2ID, vol2Name, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
		if err != nil {
			klog.Infof("second disk udev error, retrying: %v", err)
		}
		return err == nil, nil
	})
	Expect(err).To(BeNil(), "Failed to go through second volume lifecycle")

	// Attach the first disk, re-enable udev, go through the lifecycle of the
	// second disk, then detach the first.
	var detacher func()
	var args *verifyArgs
	err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
		err, detacher, args = testAttachAndMount(volID, volName, instance, client, attachAndMountArgs{})
		if err != nil {
			klog.Infof("attach before reenable failed, retrying: %v", err)
		}
		return err == nil, nil
	})
	Expect(err).To(BeNil(), "Failed second attach")
	defer detacher()

	err = instance.EnableUdev()
	Expect(err).To(BeNil(), "Failed to enable udev")

	// After udev is enabled we expect everything to succeed on the first try.

	err = testAttachWriteReadDetach(vol2ID, vol2Name, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
	Expect(err).To(BeNil(), "Failed to go through nested volume lifecycle with enabled")

	err = client.NodeUnpublishVolume(volID, args.publishDir)
	Expect(err).To(BeNil(), "Failed to unpublish first")

	err = client.NodeUnstageVolume(volID, args.stageDir)
	Expect(err).To(BeNil(), "Failed to unstage first")

	// Go through complete lifecycle again, with udev enabled.
	err = testAttachWriteReadDetach(volID, volName, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
	Expect(err).To(BeNil(), "Failed to go through volume lifecycle with udev enabled")
})
1747+
16681748
type multiZoneTestConfig struct {
16691749
diskType string
16701750
readOnly bool

test/remote/instance.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ func machineTypeMismatch(curInst *compute.Instance, newInst *compute.Instance) b
9292
// Ideally we could compare to see if the new instance has a greater minCpuPlatform.
9393
// For now we just check it was set and it's different.
9494
if curInst.MinCpuPlatform != "" && curInst.MinCpuPlatform != newInst.MinCpuPlatform {
95-
klog.Infof("CPU Platform mismatch")
95+
klog.Infof("CPU Platform mismatch: cur: %v; new: %v", curInst.MinCpuPlatform, newInst.MinCpuPlatform)
9696
return true
9797
}
9898
if (curInst.ConfidentialInstanceConfig != nil && newInst.ConfidentialInstanceConfig == nil) ||
@@ -102,7 +102,7 @@ func machineTypeMismatch(curInst *compute.Instance, newInst *compute.Instance) b
102102
return true
103103
}
104104
if curInst.SourceMachineImage != newInst.SourceMachineImage {
105-
klog.Infof("Source Machine Mismatch")
105+
klog.Infof("Source Machine Mismatch: cur: %v; new: %v", curInst.SourceMachineImage, newInst.SourceMachineImage)
106106
return true
107107
}
108108
return false
@@ -121,6 +121,7 @@ func (i *InstanceInfo) CreateOrGetInstance(localSSDCount int) error {
121121
return fmt.Errorf("Failed to create firewall rule: %v", err.Error())
122122
}
123123

124+
region := i.cfg.Zone[:len(i.cfg.Zone)-2]
124125
newInst := &compute.Instance{
125126
Name: i.cfg.Name,
126127
MachineType: fmt.Sprintf("zones/%s/machineTypes/%s", i.cfg.Zone, i.cfg.MachineType),
@@ -131,7 +132,9 @@ func (i *InstanceInfo) CreateOrGetInstance(localSSDCount int) error {
131132
Type: "ONE_TO_ONE_NAT",
132133
Name: "External NAT",
133134
},
134-
}},
135+
},
136+
Subnetwork: fmt.Sprintf("regions/%s/subnetworks/default", region),
137+
},
135138
},
136139
Disks: []*compute.AttachedDisk{
137140
{

test/remote/ssh.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,48 @@ func (i *InstanceInfo) SSHCheckAlive() error {
100100
})
101101
}
102102

103+
func (i *InstanceInfo) DisableUdev() error {
104+
return wait.Poll(5*time.Second, time.Minute, func() (bool, error) {
105+
_, err := i.SSH("systemctl", "stop", "systemd-udevd")
106+
if err != nil {
107+
klog.V(2).Infof("(will retry) failed to stop systemd-udevd: %v", err)
108+
return false, nil
109+
}
110+
_, err = i.SSH("systemctl", "stop", "systemd-udevd-kernel.socket")
111+
if err != nil {
112+
klog.V(2).Infof("(will retry) failed to stop systemd-udevd-kernel.socket: %v", err)
113+
return false, nil
114+
}
115+
_, err = i.SSH("systemctl", "stop", "systemd-udevd-control.socket")
116+
if err != nil {
117+
klog.V(2).Infof("(will retry) failed to stop systemd-udevd-control.socket: %v", err)
118+
return false, nil
119+
}
120+
return true, nil
121+
})
122+
}
123+
124+
func (i *InstanceInfo) EnableUdev() error {
125+
return wait.Poll(5*time.Second, time.Minute, func() (bool, error) {
126+
_, err := i.SSH("systemctl", "start", "systemd-udevd")
127+
if err != nil {
128+
klog.V(2).Infof("(will retry) failed to start systemd-udevd: %v", err)
129+
return false, nil
130+
}
131+
_, err = i.SSH("systemctl", "start", "systemd-udevd-kernel.socket")
132+
if err != nil {
133+
klog.V(2).Infof("(will retry) failed to start systemd-udevd-kernel.socket: %v", err)
134+
return false, nil
135+
}
136+
_, err = i.SSH("systemctl", "start", "systemd-udevd-control.socket")
137+
if err != nil {
138+
klog.V(2).Infof("(will retry) failed to start systemd-udevd-control.socket: %v", err)
139+
return false, nil
140+
}
141+
return true, nil
142+
})
143+
}
144+
103145
// runSSHCommand executes the ssh or scp command, adding the flag provided --ssh-options
104146
func runSSHCommand(cmd string, args ...string) (string, error) {
105147
if pk, ok := os.LookupEnv("JENKINS_GCE_SSH_PRIVATE_KEY_FILE"); ok {

0 commit comments

Comments
 (0)