Commit | Line | Data |
---|---|---|
99248ed2 | 1 | #! @PERL@ |
99248ed2 MW |
2 | ### |
3 | ### Remove an LVM snapshot, without falling foul of LVM bugs | |
4 | ### | |
5 | ### (c) 2011 Mark Wooding | |
6 | ### | |
7 | ||
8 | ###----- Licensing notice --------------------------------------------------- | |
9 | ### | |
10 | ### This program is free software; you can redistribute it and/or modify | |
11 | ### it under the terms of the GNU General Public License as published by | |
12 | ### the Free Software Foundation; either version 2 of the License, or | |
13 | ### (at your option) any later version. | |
14 | ### | |
15 | ### This program is distributed in the hope that it will be useful, | |
16 | ### but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 | ### GNU General Public License for more details. | |
19 | ### | |
20 | ### You should have received a copy of the GNU General Public License | |
21 | ### along with this program; if not, write to the Free Software Foundation, | |
22 | ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
23 | ||
24 | use Cwd qw(realpath); | |
25 | use Errno qw(:POSIX); | |
26 | use Fcntl qw(:mode); | |
27 | use File::stat; | |
28 | use Getopt::Long qw(:config gnu_compat bundling no_ignore_case); | |
29 | use IO::Handle; | |
30 | use Time::HiRes qw(time); | |
31 | ||
32 | our $VERSION = "@VERSION@"; | |
33 | ||
34 | ###-------------------------------------------------------------------------- | |
35 | ### Utilities. | |
36 | ||
## Diagnostics.  `whine' always reports; `burble' reports only when
## debugging is switched on; `fail' reports and then ends the program.
our $QUIS = $0;
$QUIS =~ s:^.*/::;
our $DEBUG = 0;

sub whine ($) {
  ## whine MSG
  ##
  ## Write MSG to stderr, prefixed with the program name.

  my ($text) = @_;
  print STDERR "$QUIS: $text\n";
}

sub burble ($) {
  ## burble MSG
  ##
  ## Like `whine', but silent unless $DEBUG is set.

  my ($text) = @_;
  $DEBUG and whine $text;
}

sub fail ($) {
  ## fail MSG
  ##
  ## Report MSG and exit.  The exit status is the current errno if nonzero,
  ## else the exit code of the last child process if nonzero, else 255.

  my ($text) = @_;
  whine $text;
  exit $! || ($? >> 8) || 255;
}
43 | ||
## Cleanups.  Call `cleanup BLOCK' to arrange to have BLOCK executed at the
## end of the program, or when it is killed by SIGINT or SIGTERM.  Blocks
## run most-recently-registered first, and each one runs at most once.
our @CLEANUP = ();

sub runcleanups {
  ## Drain the list as we go, so that a second call (e.g., a signal arriving
  ## while the `END' block is already at work) doesn't repeat anything.
  while (my $f = shift @CLEANUP) { $f->(); }
}

END {
  ## Inside `END', `$?' holds the status the program is about to exit with,
  ## and anything which runs a child process (as cleanups may, via `system')
  ## overwrites it.  Save and restore it so that the exit status chosen by
  ## `exit' or `fail' is the one the caller actually sees.
  my $rc = $?;
  runcleanups;
  $? = $rc;
}

$SIG{INT} = $SIG{TERM} = sub {
  my $sig = shift;
  runcleanups;

  ## Put back the default disposition and send ourselves the same signal
  ## again.
  $SIG{$sig} = 'DEFAULT';
  kill $sig => $$;
};

sub cleanup (&) { unshift @CLEANUP, $_[0]; }
56 | ||
sub fixint ($) {
  ## fixint STRING
  ##
  ## Convert STRING to a number.  A string starting with `0' is handed to
  ## `oct', which reads `0x...' as hex, `0b...' as binary and a plain
  ## leading zero as octal; anything else is converted as decimal.

  my ($str) = @_;
  return oct $str if $str =~ /^0/;
  return $str + 0;
}
58 | ||
59 | ###-------------------------------------------------------------------------- | |
60 | ### Device fiddling. | |
61 | ||
sub devsys ($) {
  ## devsys DEV
  ##
  ## Return a sysfs path for a device DEV, with the leading `/sys' removed
  ## (so the result begins with `/').  Fails if DEV can't be examined, isn't
  ## a block or character device, or has no entry under `/sys/dev'.

  my ($dev) = @_;
  my $st = stat $dev or fail "stat ($dev): $!";
  my $kind;
  if (S_ISBLK($st->mode)) { $kind = "block"; }
  elsif (S_ISCHR($st->mode)) { $kind = "char"; }
  else { fail "$dev is not a device"; }

  ## Pick the device number apart using the Linux `dev_t' layout, as glibc's
  ## `major' and `minor' macros do: the major number is 12 bits starting at
  ## bit 8 (with overflow from bit 32 up), and the minor number is the low 8
  ## bits plus a further chunk starting at bit 20.  Masking both with 0xff
  ## gives the wrong answer for any device whose minor number exceeds 255.
  my $rdev = $st->rdev;
  my $maj = (($rdev >> 8) & 0xfff) | (($rdev >> 32) & 0xfffff000);
  my $min = ($rdev & 0xff) | (($rdev >> 12) & 0xffffff00);

  ## Resolve the `/sys/dev/KIND/MAJ:MIN' symlink to the real device directory.
  my $link = "/sys/dev/$kind/$maj:$min";
  -e $link or fail "no sysfs entry $link for $dev";
  my $whole = realpath $link;
  defined $whole or fail "realpath ($link): $!";
  $whole =~ s:^/sys/:/:;
  return $whole;
}
77 | ||
## Cache of the device-mapper table, filled in by `dmtable_update'.  Maps a
## device name to a list of its table lines, each one split into fields.
our %DMTAB = ();

sub dmtable_update () {
  ## dmtable_update
  ##
  ## Discard the contents of %DMTAB and rebuild it from the output of
  ## `dmsetup table'.  Fails if the command can't be run or exits nonzero.

  burble "re-read device-mapper table";
  %DMTAB = ();
  open my $pipe, "-|", "dmsetup", "table" or fail "open (dm table): $!";
  while (my $row = $pipe->getline) {

    ## The first field, up to a run of colons and whitespace, is the device
    ## name; the remainder is the whitespace-separated table entry.
    my ($name, $tail) = split /[:\s]+/, $row, 2;
    my @fields = split ' ', $tail;
    push @{$DMTAB{$name}}, \@fields;
  }
  close $pipe or fail "dmsetup table failed (rc = $?)";
}
94 | ||
sub dmname ($) {
  ## dmname SYSPATH
  ##
  ## Return the device-mapper node name for the sysfs path SYSPATH (given
  ## without its `/sys' prefix), as read from the first line of
  ## `/sysSYSPATH/dm/name'.  Fails if that file can't be opened.

  my ($sys) = @_;
  my $path = "/sys$sys/dm/name";
  open my $fh, "<", $path or fail "open ($sys/dm/name): $!";
  my $name = $fh->getline;
  chomp $name;
  close $fh;
  return $name;
}
106 | ||
107 | ###-------------------------------------------------------------------------- | |
108 | ### I/O utilities. | |
109 | ||
sub sel ($;$$$) {
  ## sel TIMEOUT, [READS, WRITES, EXCEPTIONS]
  ##
  ## Wait for at most TIMEOUT seconds (indefinitely if TIMEOUT is `undef').
  ## Each of READS, WRITES and EXCEPTIONS is a listref containing FILE => SUB
  ## pairs: if the FILE is readable (writable, has an exceptional condition)
  ## then the SUB is invoked, with no arguments.
  ##
  ## If the wait is interrupted by a signal, return without invoking
  ## anything; callers are expected to call again in a loop.  Any other
  ## error from select(2) is fatal.

  my ($t, $r, $w, $x) = @_;
  my ($vr, $vw, $vx);
  my (%r, %w, %x);

  ## Read the arguments and build a data structure: for each kind of event,
  ## a bit vector for select(2) and a map from descriptor to handler.
  for my $i ([$r, \$vr, \%r], [$w, \$vw, \%w], [$x, \$vx, \%x]) {
    my ($a, $v, $h) = @$i;
    next unless $a;
    my @a = @$a;
    while (@a) {
      my ($f, $g) = splice @a, 0, 2;
      my $fd = $f->fileno;
      $h->{$fd} = $g;
      vec($$v, $fd, 1) = 1;
    }
  }

  ## Do the wait.  Four-argument `select' reports failure by returning -1,
  ## not `undef', and leaves the bit vectors in no useful state, so we must
  ## check the count explicitly and not look at the vectors after an error.
  my $n = select $vr, $vw, $vx, $t;
  if ($n < 0) {
    return if $! == EINTR;
    fail "select: $!";
  }

  ## Sift through the results.
  for my $i ([$vr, \%r], [$vw, \%w], [$vx, \%x]) {
    my ($v, $h) = @$i;
    for my $fd (keys %$h) {
      $h->{$fd}->() if vec($v, $fd, 1);
    }
  }
}
144 | ||
sub doread ($;$) {
  ## doread FILE, [LEN]
  ##
  ## Read up to LEN bytes (4096 if LEN is omitted) from FILE using
  ## `sysread'.  Returns `undef' at end of file, an empty string if the read
  ## failed with EAGAIN (i.e., it would have blocked), and otherwise the data
  ## which was read.  Any other read error is fatal.

  my ($f, $len) = @_;
  my $chunk;
  my $got = sysread $f, $chunk, $len // 4096;
  unless (defined $got) {
    $! == EAGAIN and return "";
    fail "read: $!";
  }
  return $got ? $chunk : undef;
}
158 | ||
sub run ($$@) {
  ## run WHAT, PROG, ARGS...
  ##
  ## Run PROG with arguments ARGS and wait for it to finish.  WHAT is a
  ## short description of the purpose, used only in the error message.
  ## Fails unless `system' reports a zero status.

  my ($what, $prog, @args) = @_;
  my $status = system($prog, @args);
  fail "$prog ($what) failed (rc = $?)" unless $status == 0;
}
167 | ||
sub capture ($@) {
  ## capture PROG, ARGS...
  ##
  ## Run PROG, passing it ARGS, with its stdout and stderr connected to
  ## pipes.  Returns a three-element list: the wait status (as left in `$?'
  ## by `waitpid'), everything written to stdout, and everything written to
  ## stderr, the latter two as strings.

  my ($prog, @args) = @_;
  my ($out, $err) = ("", "");
  my ($outpipe_in, $outpipe_out, $errpipe_in, $errpipe_out);
  pipe $outpipe_in, $outpipe_out or fail "pipe ($prog out): $!";
  pipe $errpipe_in, $errpipe_out or fail "pipe ($prog err): $!";
  defined (my $kid = fork) or fail "fork ($prog): $!";
  if ($kid == 0) {

    ## We're the child.  If anything below goes wrong then `fail' will exit
    ## through the `END' block: forget the cleanups inherited from the
    ## parent first, so that the child doesn't run them on its behalf.
    @CLEANUP = ();

    close $outpipe_in
      and close $errpipe_in
      and open STDOUT, ">&", $outpipe_out
      and open STDERR, ">&", $errpipe_out
      and exec $prog, @args
      or fail "exec $prog: $!";
  }
  close $outpipe_out;
  close $errpipe_out;

  ## Collect output from both pipes until each has reported end-of-file.  A
  ## pipe's handle is set to `undef' once it's finished with, which is how
  ## we know to stop watching it.
  for (;;) {
    my @r = ();
    for my $i ([\$outpipe_in, \$out, "out"],
               [\$errpipe_in, \$err, "err"]) {
      my ($p, $b, $w) = @$i;
      push @r, $$p => sub {
        my $buf = doread $$p;
        if (defined $buf) { $$b .= $buf; }
        else { close $$p; $$p = undef; }
      } if $$p;
    }
    last unless @r;
    sel undef, \@r;
  }

  ## Reap the child.  `waitpid' returns -1 (which is true) on failure, so
  ## compare its result against the process-id we expect.
  waitpid($kid, 0) == $kid or fail "waitpid ($prog): $!";
  return $?, $out, $err;
}
207 | ||
208 | ###-------------------------------------------------------------------------- | |
209 | ### Monitoring udev events. | |
210 | ||
sub umon_create (@) {
  ## umon_create ARGS...
  ##
  ## Create a udev monitor, with the given `udevadm monitor' arguments, and
  ## return an object: a hashref holding the monitor's process-id (`KID'),
  ## the nonblocking pipe from it (`PIPE'), and any output read so far but
  ## not yet parsed (`BUF').  We always select only kernel events.  We try
  ## to wait for the monitor to start up before returning.  Don't trust
  ## this: use `umon_sync' anyway.

  my @args = @_;
  my $u = {};

  ## Start the monitor process, with its output unbuffered, and arrange to
  ## kill it when we exit.
  $u->{KID} = open($u->{PIPE}, "-|",
                   "stdbuf", "-o0",
                   "udevadm", "monitor", "--kernel", "--property", @args)
    or fail "open (umon): $!";
  cleanup { kill 9, $u->{KID} };
  $u->{PIPE}->blocking(0) or fail "set non-blocking (umon): $!";

  ## Wait for the end of the preamble, indicated by the first blank line.
  ## From observation with strace(1), this means that the monitor has
  ## successfully attached itself to its netlink socket and is ready to fetch
  ## events.  Give up if this takes more than five seconds.
  my $ok = 0;
  my $buf = "";
  my $now = time;
  my $end = $now + 5;
  while (!$ok) {
    sel
      $end - $now,
      [ $u->{PIPE} => sub {
          defined (my $chunk = doread $u->{PIPE})
            or fail "read (umon): eof";
          $buf .= $chunk;

          ## Keep everything after the first blank line: it may already
          ## contain (part of) an event, possibly spanning several lines,
          ## so `.' must be allowed to match newlines.
          if ($buf =~ /\n\n(.*)\z/s) { $ok = 1; $buf = $1; }
        }
      ];
    $now = time;

    ## Only complain about the deadline if we're still waiting: a preamble
    ## which turned up at the last moment is still a success.
    if (!$ok && $now >= $end) { fail "umon timeout"; }
  }
  $u->{BUF} = $buf;

  ## Done.
  return $u;
}
255 | ||
sub umon_read ($) {
  ## umon_read UMON
  ##
  ## Read whatever events are currently available from UMON, without
  ## blocking, and return them as a list of hash references mapping property
  ## names to values.  An incomplete trailing event is left in UMON's buffer
  ## for next time.  Fails if the monitor's pipe reaches end-of-file.

  my ($u) = @_;
  my @events = ();
  while (1) {
    my $chunk = doread $u->{PIPE};
    defined $chunk or fail "read (umon): end of file";
    last if $chunk eq "";

    ## Events are separated by blank lines.  The piece after the last
    ## separator is unfinished (or empty), so save it for later.
    my @paras = split /\n\n/, $u->{BUF} . $chunk, -1;
    $u->{BUF} = pop @paras;

    ## Within an event, keep only the `KEY=VALUE' lines.
    for my $para (@paras) {
      my %props = map { /^(\w+)=(.*)$/ } split /\n/, $para;
      push @events, \%props;
    }
  }
  return @events;
}
276 | ||
sub umon_sync ($$) {
  ## umon_sync UMON, DEV
  ##
  ## Wait for UMON to report an event about the device DEV (without its
  ## `/dev/' prefix), triggering one every two seconds until it shows up.
  ## This is useful for synchronizing.  Returns the list of other events
  ## which were read along the way: the matching event itself is dropped.

  my ($u, $dev) = @_;
  my $now = time;
  my $retry = 0;
  my $done = 0;
  my @ev = ();
  burble "sync with udev";

  ## Handler for when the monitor has something to say: collect events, and
  ## notice if one of them is the one we're waiting for.
  my $scan = sub {
    my @batch = umon_read $u;
    for my $i (0 .. $#batch) {
      if ($batch[$i]{DEVNAME} eq $dev) {
        $done = 1;
        push @ev, @batch[$i + 1 .. $#batch];
        last;
      }
      push @ev, $batch[$i];
    }
  };

  while (!$done) {

    ## Time to (re)trigger an event on the device.  This always happens on
    ## the first pass, because $retry starts out at zero.
    if ($now >= $retry) {
      $retry = $now + 2;
      run "trigger $dev", "udevadm", "trigger", "--sysname-match=$dev";
    }

    ## Wait for events, but only until the next trigger is due.
    sel $retry - $now, [ $u->{PIPE} => $scan ];
    $now = time;
  }

  return @ev;
}
317 | ||
318 | ###-------------------------------------------------------------------------- | |
319 | ### Main code. | |
320 | ||
## Parse the command line.  On success, $VG and $LV hold the two halves of
## the single VGNAME/LVNAME argument.
our $USAGE = "usage: $QUIS VGNAME/LVNAME";
sub version { print "$QUIS, version $VERSION\n"; }
sub help {
  print <<EOF;
$USAGE

Options:
-h, --help Show this help text.
-v, --version Show the program version number.
-d, --debug Show debugging information.
-n, --no-act Don't take corrective actions.
EOF
}

## The help text advertises `--no-act', so that must be the primary name of
## the option; `--noact' is kept as an alias because it used to be the only
## spelling which was accepted.
our $NOACT = 0;
GetOptions('help|h|?' => sub { version; help; exit; },
           'version|v' => sub { version; exit; },
           'debug|d' => \$DEBUG,
           'no-act|noact|n' => \$NOACT)
  and @ARGV == 1
  and $ARGV[0] =~ m:(.+)/(.+):
  or do { print STDERR $USAGE, "\n"; exit 1; };
our ($VG, $LV) = ($1, $2);
345 | ||
## Before doing anything drastic, make sure that the volume exists, that
## it's a device-mapper device, and that its table says it's a snapshot.
dmtable_update;
our $SYS = devsys "/dev/$VG/$LV";
burble "sysfs name is $SYS";
my $t = $DMTAB{dmname $SYS};
$t or fail "/dev/$VG/$LV isn't a device-mapper device";
if ($DEBUG) {
  burble "found table...";
  for my $row (@$t) { burble "\t" . join(" ", @$row); }
}
fail "/dev/$VG/$LV isn't a snapshot" unless $t->[0][2] eq "snapshot";
358 | ||
## Create a udev monitor.  We're only interested in disk-shaped block
## devices.  (If we use some other device kind for synchronization then this
## filter will have to be broadened.)
my $u = umon_create "--subsystem-match=block/disk";

## Prepare for the awful synchronization hack.  We need to make sure, below,
## that we've read all of the interesting events resulting from an `lvremove'
## call.  To do this, we wait for an event on a different device -- but we
## must avoid being fooled by spurious events on this device.  As an attempt
## to minimize the probability of this going wrong, acquire a pet device
## which nobody else is using.  The best idea seems to be a loopback device.
open my $lopipe, "-|", "losetup", "--show", "--find", "/etc/motd"
  or fail "open (losetup attach): $!";
my $lo = $lopipe->getline;
{ local $/ = undef; <$lopipe>; }

## Closing a pipe fails if the child exited nonzero, and in that case `$!'
## is zero: report the exit status instead of an empty error message.
unless ($lopipe->close) {
  fail $! ? "wait (losetup attach): $!" : "losetup attach failed (rc = $?)";
}

## We're relying on `losetup' to have told us which device it picked.
defined $lo && $lo =~ /\S/
  or fail "losetup didn't report a loopback device";
chomp $lo;
$lo =~ s:^/dev/::;
cleanup { system "losetup", "--detach", "/dev/$lo" };

## Initial synchronization, to make sure stuff works.
umon_sync $u, $lo;
380 | ||
## Make the first attempt at removing the snapshot, holding on to its
## output.  Unless it exits with status 5, which is the failure we know how
## to recover from, relay the output and finish with a matching exit status.
burble "initial attempt to remove snapshot";
my ($rc, $out, $err) = capture "lvremove", "--force", "$VG/$LV";
unless ($rc == 0x500) {
  print STDOUT $out;
  print STDERR $err;
  burble "lvremove didn't explode (rc = $rc): we're done here";

  ## Turn the wait status into an exit code: the child's own exit code if
  ## it had one, or 128 plus the low byte if it was killed by a signal.
  my $code = $rc;
  if ($rc >> 8) { $code = $rc >> 8; }
  elsif ($rc & 255) { $code = $rc + 128; }
  exit $code;
}
burble "initial lvremove failed";
394 | ||
## OK, stuff went wrong.  First see if there was a udev cookie left over, and
## if so try to release it.  It's important to know that we've read all of
## the relevant uevents, so synchronize again, and then note the cookie from
## each event which concerned our device.  The low 16 bits of the event's
## cookie are combined with the prefix 0xd4d0000 (presumably giving the form
## in which `dmsetup udevcookies' lists them -- TODO confirm).
my @e = umon_sync $u, $lo;
my %c = ();
for my $ev (@e) {
  next unless $ev->{DEVPATH} eq $SYS;
  next unless exists $ev->{DM_COOKIE};
  my $cookie = ($ev->{DM_COOKIE} & 0xffff) | 0xd4d0000;
  $c{$cookie} = 1;
}
burble "cookies used: " . join(", ", map { sprintf "0x%x", $_ } keys %c);
405 | ||
## Find the used cookies which are still extant, and release them.  The
## first line of `dmsetup udevcookies' output is skipped as a heading; on
## the remaining lines, the first field is the cookie.
open my $uc, "-|", "dmsetup", "udevcookies" or fail "open (cookies): $!";
$uc->getline;
my @leak = ();
while (my $line = $uc->getline) {
  my @f = split ' ', $line;
  next unless @f;
  push @leak, $f[0] if $c{fixint $f[0]};
}
close $uc or fail "udevcookies failed (rc = $?)";
for my $c (@leak) {
  burble "release leaked cookie $c";
  run "release cookie", "dmsetup", "udevreleasecookie", $c unless $NOACT;
}
419 | ||
## If we're very unlucky, the origin volume may still be suspended.  Resume
## it now, or the next attempt will get stuck.  (Resuming is idempotent, so
## we don't need to check whether it's already running.)  Finding the origin
## is annoying: search the device-mapper table for a device with a single
## `snapshot-origin' table line referencing the same backing store as the
## snapshot.  There must be exactly one such device.
my $back = $DMTAB{dmname $SYS}[0][3];
burble "backend device $back";
my @origins = grep {
  my $tab = $DMTAB{$_};
  @$tab == 1 &&
    $tab->[0][2] eq "snapshot-origin" &&
    $tab->[0][3] eq $back
} keys %DMTAB;
@origins > 1 and fail "snapshot appears to have multiple origins";
@origins or fail "couldn't find snapshot origin device";
my $orig = $origins[0];
burble "found origin volume $orig; resuming...";
$NOACT or run "resume origin $orig", "dmsetup", "resume", $orig;
440 | ||
## Have another go at removing the snapshot.
burble "retry snapshot removal";
$NOACT or run "retry", "lvremove", "--force", "$VG/$LV";

## OK, we're on the way to recovery.  The origin device may now be not a
## snapshot-origin any more.  Refresh the device-mapper table and inspect
## it: if the backend device is still around but the origin has stopped
## being a snapshot-origin, then remove the backend.
dmtable_update;
if (-d "/sys/dev/block/$back") {
  my $backdm = dmname "/dev/block/$back";
  unless ($DMTAB{$orig}[0][2] eq "snapshot-origin") {
    burble "origin released but backend $backdm still exists: remove";
    $NOACT or run "remove backend $backdm", "dmsetup", "remove", $backdm;
  }
}

## All done.  There, that wasn't so bad, was it?
burble "completed successfully";
exit 0;
460 | ||
461 | ###----- That's all, folks -------------------------------------------------- |