Skip to content

Commit a42de85

Browse files
committed
Manually create udev links if udev trigger doesn't work
Change-Id: I26bffb3eda447c8343ce39e69c34cf31616120e3
1 parent 24692d0 commit a42de85

File tree

5 files changed

+167
-17
lines changed

5 files changed

+167
-17
lines changed

pkg/deviceutils/device-utils.go

Lines changed: 31 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -290,7 +290,10 @@ func (m *deviceUtils) VerifyDevicePath(devicePaths []string, deviceName string)
290290
})
291291

292292
if err != nil {
293-
return "", fmt.Errorf("failed to find and re-link disk %s with udevadm after retrying for %v: %w", deviceName, pollTimeout, err)
293+
klog.Warningf("For device %s udevadmin failed: %v. Trying to manually link", deviceName, err)
294+
if err := manuallySetDevicePath(deviceName); err != nil {
295+
return "", fmt.Errorf("failed to manually set link for disk %s: %w", deviceName, err)
296+
}
294297
}
295298

296299
return devicePath, nil
@@ -338,11 +341,11 @@ func findAvailableDevFsPaths() ([]string, error) {
338341
return append(diskSDPaths, diskNvmePaths...), nil
339342
}
340343

341-
func udevadmTriggerForDiskIfExists(deviceName string) error {
344+
func findDevice(deviceName string) (string, string, error) {
342345
devFsPathToSerial := map[string]string{}
343346
devFsPaths, err := findAvailableDevFsPaths()
344347
if err != nil {
345-
return err
348+
return "", "", err
346349
}
347350
for _, devFsPath := range devFsPaths {
348351
devFsSerial, err := getDevFsSerial(devFsPath)
@@ -355,17 +358,33 @@ func udevadmTriggerForDiskIfExists(deviceName string) error {
355358
klog.V(4).Infof("device path %s, serial number %v", devFsPath, devFsSerial)
356359
devFsPathToSerial[devFsPath] = devFsSerial
357360
if devFsSerial == deviceName {
358-
// Found the disk that we're looking for so run a trigger on it
359-
// to resolve its /dev/by-id/ path
360-
klog.Warningf("udevadm --trigger running to fix disk at path %s which has serial number %s", devFsPath, devFsSerial)
361-
err := udevadmChangeToDrive(devFsPath)
362-
if err != nil {
363-
return fmt.Errorf("udevadm --trigger failed to fix device path %s which has serial number %s: %w", devFsPath, devFsSerial, err)
364-
}
365-
return nil
361+
return devFsPath, devFsSerial, nil
366362
}
367363
}
368-
return fmt.Errorf("udevadm --trigger requested to fix disk %s but no such disk was found in device path %v", deviceName, devFsPathToSerial)
364+
return "", "", fmt.Errorf("udevadm --trigger requested to fix disk %s but no such disk was found in device path %v", deviceName, devFsPathToSerial)
365+
}
366+
367+
func manuallySetDevicePath(deviceName string) error {
368+
devFsPath, devFsSerial, err := findDevice(deviceName)
369+
if err != nil {
370+
return err
371+
}
372+
return os.Symlink(devFsPath, path.Join(diskByIdPath, diskGooglePrefix+devFsSerial))
373+
}
374+
375+
func udevadmTriggerForDiskIfExists(deviceName string) error {
376+
devFsPath, devFsSerial, err := findDevice(deviceName)
377+
if err != nil {
378+
return err
379+
}
380+
// Found the disk that we're looking for so run a trigger on it
381+
// to resolve its /dev/by-id/ path
382+
klog.Warningf("udevadm --trigger running to fix disk at path %s which has serial number %s", devFsPath, devFsSerial)
383+
err = udevadmChangeToDrive(devFsPath)
384+
if err != nil {
385+
return fmt.Errorf("udevadm --trigger failed to fix device path %s which has serial number %s: %w", devFsPath, devFsSerial, err)
386+
}
387+
return nil
369388
}
370389

371390
// Calls "udevadm trigger --action=change" on the specified drive. drivePath

test/e2e/tests/setup_e2e_test.go

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,8 +47,8 @@ var (
4747
serviceAccount = flag.String("service-account", "", "Service account to bring up instance with")
4848
vmNamePrefix = flag.String("vm-name-prefix", "gce-pd-csi-e2e", "VM name prefix")
4949
architecture = flag.String("arch", "amd64", "Architecture pd csi driver build on")
50-
minCpuPlatform = flag.String("min-cpu-platform", "rome", "Minimum CPU architecture")
51-
mwMinCpuPlatform = flag.String("min-cpu-platform-mw", "sapphirerapids", "Minimum CPU architecture for multiwriter tests")
50+
minCpuPlatform = flag.String("min-cpu-platform", "AMD Rome", "Minimum CPU architecture")
51+
mwMinCpuPlatform = flag.String("min-cpu-platform-mw", "Intel Sapphire Rapids", "Minimum CPU architecture for multiwriter tests")
5252
zones = flag.String("zones", "us-east4-a,us-east4-c", "Zones to run tests in. If there are multiple zones, separate each by comma")
5353
machineType = flag.String("machine-type", "n2d-standard-4", "Type of machine to provision instance on")
5454
imageURL = flag.String("image-url", "projects/ubuntu-os-cloud/global/images/family/ubuntu-minimal-2404-lts-amd64", "OS image url to get image from")
@@ -135,6 +135,12 @@ var _ = BeforeSuite(func() {
135135
hdtcc <- NewTestContext(curZone, *hdMinCpuPlatform, *hdMachineType, "0")
136136
}(zone)
137137
}
138+
go func(curZone string) {
139+
wg.Add(1)
140+
defer GinkgoRecover()
141+
defer wg.Done()
142+
hdtcc <- NewTestContext(curZone, *hdMinCpuPlatform, *hdMachineType, "0")
143+
}(zone)
138144
wg.Wait()
139145
}
140146

test/e2e/tests/single_zone_e2e_test.go

Lines changed: 80 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1668,6 +1668,86 @@ var _ = Describe("GCE PD CSI Driver", func() {
16681668
Expect(err).To(BeNil(), "Failed to rm file path %s: %v", fp, err)
16691669
})
16701670

1671+
It("Should mount if udev disabled, and remount if it's enabled again", func() {
1672+
testContext := getRandomTestContext()
1673+
p, z, _ := testContext.Instance.GetIdentity()
1674+
client := testContext.Client
1675+
instance := testContext.Instance
1676+
1677+
err := instance.DisableUdev()
1678+
Expect(err).To(BeNil(), "Failed to disable udev")
1679+
1680+
// Create Disk
1681+
volName, volID := createAndValidateUniqueZonalDisk(client, p, z, standardDiskType)
1682+
vol2Name, vol2ID := createAndValidateUniqueZonalDisk(client, p, z, standardDiskType)
1683+
1684+
defer func() {
1685+
// Delete Disks
1686+
err := client.DeleteVolume(volID)
1687+
Expect(err).To(BeNil(), "DeleteVolume failed")
1688+
1689+
err = client.DeleteVolume(vol2ID)
1690+
Expect(err).To(BeNil(), "DeleteVolume failed")
1691+
1692+
// Validate Disks Deleted
1693+
_, err = computeService.Disks.Get(p, z, volName).Do()
1694+
Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found")
1695+
_, err = computeService.Disks.Get(p, z, vol2Name).Do()
1696+
Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found")
1697+
}()
1698+
1699+
// Attach & detach disk. We retry as we expect the udev repair to take a little bit of time.
1700+
err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
1701+
err = testAttachWriteReadDetach(volID, volName, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
1702+
if err != nil {
1703+
klog.Infof("initial udev error, retrying: %v", err)
1704+
}
1705+
return err == nil, nil
1706+
})
1707+
Expect(err).To(BeNil(), "Failed to go through volume lifecycle")
1708+
1709+
// Attach a different disk. The conflicting udev paths should not cause a problem.
1710+
err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
1711+
err = testAttachWriteReadDetach(vol2ID, vol2Name, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
1712+
if err != nil {
1713+
klog.Infof("second disk udev error, retrying: %v", err)
1714+
}
1715+
return err == nil, nil
1716+
})
1717+
Expect(err).To(BeNil(), "Failed to go through second volume lifecycle")
1718+
1719+
// Attach, reenable udev, go through lifecycle of second disk, detach first
1720+
var detacher func()
1721+
var args *verifyArgs
1722+
err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
1723+
err, detacher, args = testAttachAndMount(volID, volName, instance, client, attachAndMountArgs{})
1724+
if err != nil {
1725+
klog.Infof("attach before reenable failed, retrying: %v", err)
1726+
}
1727+
return err == nil, nil
1728+
})
1729+
Expect(err).To(BeNil(), "Failed second attach")
1730+
defer detacher()
1731+
1732+
err = instance.EnableUdev()
1733+
Expect(err).To(BeNil(), "Failed to enable udev")
1734+
1735+
// After udev is enabled we expect everything to succeed on the first try.
1736+
1737+
err = testAttachWriteReadDetach(vol2ID, vol2Name, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
1738+
Expect(err).To(BeNil(), "Failed to go through nested volume lifecycle with enabled")
1739+
1740+
err = client.NodeUnpublishVolume(volID, args.publishDir)
1741+
Expect(err).To(BeNil(), "Failed to unpublish first")
1742+
1743+
err = client.NodeUnstageVolume(volID, args.stageDir)
1744+
Expect(err).To(BeNil(), "Failed to unstage first")
1745+
1746+
// Go through complete lifecycle again, with udev enabled.
1747+
err = testAttachWriteReadDetach(volID, volName, instance, client, false /* readOnly */, false /* detachAndReattach */, false /* setupDataCache */)
1748+
Expect(err).To(BeNil(), "Failed to go through volume lifecycle with udev enabled")
1749+
})
1750+
16711751
type multiZoneTestConfig struct {
16721752
diskType string
16731753
readOnly bool

test/remote/instance.go

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -92,7 +92,7 @@ func machineTypeMismatch(curInst *compute.Instance, newInst *compute.Instance) b
9292
// Ideally we could compare to see if the new instance has a greater minCpuPlatform.
9393
// For now we just check it was set and it's different.
9494
if curInst.MinCpuPlatform != "" && curInst.MinCpuPlatform != newInst.MinCpuPlatform {
95-
klog.Infof("CPU Platform mismatch")
95+
klog.Infof("CPU Platform mismatch: cur: %v; new: %v", curInst.MinCpuPlatform, newInst.MinCpuPlatform)
9696
return true
9797
}
9898
if (curInst.ConfidentialInstanceConfig != nil && newInst.ConfidentialInstanceConfig == nil) ||
@@ -102,7 +102,7 @@ func machineTypeMismatch(curInst *compute.Instance, newInst *compute.Instance) b
102102
return true
103103
}
104104
if curInst.SourceMachineImage != newInst.SourceMachineImage {
105-
klog.Infof("Source Machine Mismatch")
105+
klog.Infof("Source Machine Mismatch: cur: %v; new: %v", curInst.SourceMachineImage, newInst.SourceMachineImage)
106106
return true
107107
}
108108
return false
@@ -121,6 +121,7 @@ func (i *InstanceInfo) CreateOrGetInstance(localSSDCount int) error {
121121
return fmt.Errorf("Failed to create firewall rule: %v", err.Error())
122122
}
123123

124+
region := i.cfg.Zone[:len(i.cfg.Zone)-2]
124125
newInst := &compute.Instance{
125126
Name: i.cfg.Name,
126127
MachineType: fmt.Sprintf("zones/%s/machineTypes/%s", i.cfg.Zone, i.cfg.MachineType),
@@ -131,7 +132,9 @@ func (i *InstanceInfo) CreateOrGetInstance(localSSDCount int) error {
131132
Type: "ONE_TO_ONE_NAT",
132133
Name: "External NAT",
133134
},
134-
}},
135+
},
136+
Subnetwork: fmt.Sprintf("regions/%s/subnetworks/default", region),
137+
},
135138
},
136139
Disks: []*compute.AttachedDisk{
137140
{

test/remote/ssh.go

Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,48 @@ func (i *InstanceInfo) SSHCheckAlive() error {
100100
})
101101
}
102102

103+
func (i *InstanceInfo) DisableUdev() error {
104+
return wait.Poll(5*time.Second, time.Minute, func() (bool, error) {
105+
_, err := i.SSH("systemctl", "stop", "systemd-udevd")
106+
if err != nil {
107+
klog.V(2).Infof("(will retry) failed to stop systemd-udevd: %v", err)
108+
return false, nil
109+
}
110+
_, err = i.SSH("systemctl", "stop", "systemd-udevd-kernel.socket")
111+
if err != nil {
112+
klog.V(2).Infof("(will retry) failed to stop systemd-udevd-kernel.socket: %v", err)
113+
return false, nil
114+
}
115+
_, err = i.SSH("systemctl", "stop", "systemd-udevd-control.socket")
116+
if err != nil {
117+
klog.V(2).Infof("(will retry) failed to stop systemd-udevd-control.socket: %v", err)
118+
return false, nil
119+
}
120+
return true, nil
121+
})
122+
}
123+
124+
func (i *InstanceInfo) EnableUdev() error {
125+
return wait.Poll(5*time.Second, time.Minute, func() (bool, error) {
126+
_, err := i.SSH("systemctl", "start", "systemd-udevd")
127+
if err != nil {
128+
klog.V(2).Infof("(will retry) failed to start systemd-udevd: %v", err)
129+
return false, nil
130+
}
131+
_, err = i.SSH("systemctl", "start", "systemd-udevd-kernel.socket")
132+
if err != nil {
133+
klog.V(2).Infof("(will retry) failed to start systemd-udevd-kernel.socket: %v", err)
134+
return false, nil
135+
}
136+
_, err = i.SSH("systemctl", "start", "systemd-udevd-control.socket")
137+
if err != nil {
138+
klog.V(2).Infof("(will retry) failed to start systemd-udevd-control.socket: %v", err)
139+
return false, nil
140+
}
141+
return true, nil
142+
})
143+
}
144+
103145
// runSSHCommand executes the ssh or scp command, adding the flag provided --ssh-options
104146
func runSSHCommand(cmd string, args ...string) (string, error) {
105147
if pk, ok := os.LookupEnv("JENKINS_GCE_SSH_PRIVATE_KEY_FILE"); ok {

0 commit comments

Comments
 (0)