| 1 | #! @PERL@ |
| 2 | ### |
| 3 | ### Remove an LVM snapshot, without falling foul of LVM bugs |
| 4 | ### |
| 5 | ### (c) 2011 Mark Wooding |
| 6 | ### |
| 7 | |
| 8 | ###----- Licensing notice --------------------------------------------------- |
| 9 | ### |
| 10 | ### This program is free software; you can redistribute it and/or modify |
| 11 | ### it under the terms of the GNU General Public License as published by |
| 12 | ### the Free Software Foundation; either version 2 of the License, or |
| 13 | ### (at your option) any later version. |
| 14 | ### |
| 15 | ### This program is distributed in the hope that it will be useful, |
| 16 | ### but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 17 | ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 18 | ### GNU General Public License for more details. |
| 19 | ### |
| 20 | ### You should have received a copy of the GNU General Public License |
| 21 | ### along with this program; if not, write to the Free Software Foundation, |
| 22 | ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| 23 | |
| 24 | use Cwd qw(realpath); |
| 25 | use Errno qw(:POSIX); |
| 26 | use Fcntl qw(:mode); |
| 27 | use File::stat; |
| 28 | use Getopt::Long qw(:config gnu_compat bundling no_ignore_case); |
| 29 | use IO::Handle; |
| 30 | use Time::HiRes qw(time); |
| 31 | |
| 32 | our $VERSION = "@VERSION@"; |
| 33 | |
| 34 | ###-------------------------------------------------------------------------- |
| 35 | ### Utilities. |
| 36 | |
## Error handling and reporting.
##
## $QUIS is the program name with any leading directories stripped; it
## prefixes every message. $DEBUG switches on the chatter from `burble'.
(our $QUIS = $0) =~ s:^.*/::;
our $DEBUG = 0;

sub whine ($) {
    ## whine MSG
    ##
    ## Write MSG to stderr, prefixed by the program name.

    my ($msg) = @_;
    print STDERR "$QUIS: $msg\n";
}

sub burble ($) {
    ## burble MSG
    ##
    ## Report MSG like `whine', but only if debugging is switched on.

    my ($msg) = @_;
    whine $msg if $DEBUG;
}

sub fail ($) {
    ## fail MSG
    ##
    ## Report MSG and exit with a nonzero status: the current value of `$!'
    ## if it is set, else the exit code from `$?', else 255. Never returns.

    my ($msg) = @_;

    ## Choose the exit code before printing, so that writing the message
    ## can't disturb `$!'.
    my $rc = ($! + 0) || ($? >> 8) || 255;
    whine $msg;

    ## `exit' is a named unary operator and binds more tightly than `||':
    ## writing `exit $! || ...' would exit with `$!' alone (possibly zero),
    ## so the code is computed separately above.
    exit $rc;
}
| 43 | |
## Cleanups. Call `cleanup BLOCK' to arrange to have BLOCK executed at the
## end of the program, or when it is interrupted by SIGINT or SIGTERM. The
## most recently registered cleanup runs first.
our @CLEANUP = ();

sub runcleanups {
    ## Run the registered cleanups. Each is removed from the list before it
    ## is called, so none can run twice even if we are entered again (e.g.,
    ## from the signal handler while the END block is already at work).

    while (defined (my $f = shift @CLEANUP)) { $f->(); }
}

END {
    ## Inside an END block, `$?' holds the status the program is about to
    ## exit with. Cleanups may run child processes, which would overwrite
    ## it, so protect it for the duration.
    local $?;
    runcleanups;
}

$SIG{INT} = $SIG{TERM} = sub {
    ## Tidy up, then die from the same signal with the default disposition
    ## so that our parent sees the true cause of death.
    my $sig = shift;
    runcleanups;
    $SIG{$sig} = 'DEFAULT';
    kill $sig => $$;
};

sub cleanup (&) { unshift @CLEANUP, $_[0]; }
| 56 | |
sub fixint ($) {
    ## fixint STRING
    ##
    ## Convert STRING to a number. Anything beginning with `0' is handed to
    ## `oct' (which copes with octal, and with `0x' and `0b' prefixes);
    ## anything else is converted as ordinary decimal.

    my $text = shift;
    if ($text =~ /^0/) { return oct $text; }
    return $text + 0;
}
| 58 | |
| 59 | ###-------------------------------------------------------------------------- |
| 60 | ### Device fiddling. |
| 61 | |
sub devsys ($) {
    ## devsys DEV
    ##
    ## Return a sysfs path for a device DEV: the canonical location of its
    ## `/sys/dev/KIND/MAJ:MIN' entry, with the leading `/sys' removed. Fails
    ## if DEV can't be examined, isn't a block or character device, or has no
    ## sysfs entry.

    my ($dev) = @_;
    my $st = stat $dev or fail "stat ($dev): $!";
    my $kind;
    if (S_ISBLK($st->mode)) { $kind = "block"; }
    elsif (S_ISCHR($st->mode)) { $kind = "char"; }
    else { fail "$dev is not a device"; }

    ## Pick the device number apart using the Linux layout, as the C
    ## library's `major' and `minor' do. The old 8-bit fields are only the
    ## low-order parts: minors above 255 (easily reached by device-mapper
    ## nodes) keep their remaining bits higher up.
    my $rdev = $st->rdev;
    my $maj = (($rdev >> 8) & 0xfff) | (($rdev >> 32) & ~0xfff);
    my $min = ($rdev & 0xff) | (($rdev >> 12) & ~0xff);

    my $whole = realpath("/sys/dev/$kind/$maj:$min");
    defined $whole or fail "no sysfs entry for $dev ($kind $maj:$min): $!";
    $whole =~ s:^/sys/:/:;
    return $whole;
}
| 77 | |
## The device-mapper table, as last read by `dmtable_update'. Maps each
## device name to a list of table lines, each line being a listref of its
## whitespace-separated fields (not including the name).
our %DMTAB = ();

sub dmtable_update () {
    ## dmtable_update
    ##
    ## Throw away the current contents of %DMTAB and refill it from the
    ## output of `dmsetup table'. Fails if the command can't be started or
    ## exits unsuccessfully.

    burble "re-read device-mapper table";
    %DMTAB = ();
    open my $pipe, "-|", "dmsetup", "table" or fail "open (dm table): $!";
    while (my $entry = $pipe->getline) {
        ## The name runs up to the first colon or whitespace; everything
        ## after that is the table line proper.
        my ($name, $spec) = split /[:\s]+/, $entry, 2;
        my @fields = split ' ', $spec;
        push @{$DMTAB{$name}}, \@fields;
    }
    close $pipe or fail "dmsetup table failed (rc = $?)";
}
| 94 | |
sub dmname ($) {
    ## dmname SYSPATH
    ##
    ## Return the device-mapper node name for the sysfs path SYSPATH (given
    ## without its `/sys' prefix), as recorded in the device's `dm/name'
    ## attribute. Fails if the attribute can't be opened.

    my $syspath = shift;
    my $attr = "/sys$syspath/dm/name";
    open my $fh, "<", $attr or fail "open ($syspath/dm/name): $!";
    my $name = $fh->getline;
    chomp $name;
    close $fh;
    return $name;
}
| 106 | |
| 107 | ###-------------------------------------------------------------------------- |
| 108 | ### I/O utilities. |
| 109 | |
sub sel ($;$$$) {
    ## sel TIMEOUT, [READS, WRITES, EXCEPTIONS]
    ##
    ## Wait for at most TIMEOUT seconds (indefinitely if TIMEOUT is `undef').
    ## Each of READS, WRITES and EXCEPTIONS is a listref containing FILE => SUB
    ## pairs: if the FILE is readable (writable, has an exceptional condition)
    ## then the SUB is invoked, with no arguments.
    ##
    ## If the wait is interrupted by a signal then we return without calling
    ## anything: callers are expected to loop. Any other error is fatal.

    my ($t, $r, $w, $x) = @_;
    my ($vr, $vw, $vx);
    my (%r, %w, %x);

    ## Read the arguments and build a data structure: for each kind of
    ## event, a bit vector for select(2) and a map from descriptor number to
    ## handler.
    for my $i ([$r, \$vr, \%r], [$w, \$vw, \%w], [$x, \$vx, \%x]) {
        my ($a, $v, $h) = @$i;
        next unless $a;
        my @a = @$a;
        while (@a) {
            my ($f, $g) = splice @a, 0, 2;
            my $fd = $f->fileno;
            $h->{$fd} = $g;
            vec($$v, $fd, 1) = 1;
        }
    }

    ## Do the wait. On failure `select' returns -1 (not `undef') and leaves
    ## the bit vectors in an unhelpful state, so check for that explicitly
    ## rather than dispatching on stale bits.
    my $n = select $vr, $vw, $vx, $t;
    if ($n < 0) {
        return if $!{EINTR};
        fail("select: $!");
    }

    ## Sift through the results. Work from a snapshot of the descriptor
    ## numbers so that nothing a handler does can upset the iteration.
    for my $i ([$vr, \%r], [$vw, \%w], [$vx, \%x]) {
        my ($v, $h) = @$i;
        for my $fd (keys %$h) {
            $h->{$fd}->() if vec $v, $fd, 1;
        }
    }
}
| 144 | |
sub doread ($;$) {
    ## doread FILE, [LEN]
    ##
    ## Make a single attempt to read up to LEN bytes (4096 if LEN is omitted)
    ## from FILE. Returns the data read; or `undef' at end of file; or an
    ## empty string if reading would block. Any other read error is fatal.

    my ($fh, $want) = @_;
    $want = 4096 unless defined $want;

    my $chunk;
    my $got = sysread $fh, $chunk, $want;
    unless (defined $got) {
        fail "read: $!" unless $! == EAGAIN;
        return "";
    }
    return $got ? $chunk : undef;
}
| 158 | |
sub run ($$@) {
    ## run WHAT, PROG, ARGS...
    ##
    ## Run PROG with arguments ARGS and wait for it to finish. If it doesn't
    ## exit successfully then fail, mentioning WHAT (a short description of
    ## the job) in the message.

    my ($what, $prog, @args) = @_;
    my $rc = system $prog, @args;
    fail "$prog ($what) failed (rc = $?)" unless $rc == 0;
}
| 167 | |
sub capture ($@) {
    ## capture PROG, ARGS...
    ##
    ## Run PROG, passing it ARGS. Returns the wait status (as left in `$?'),
    ## followed by everything the program wrote to stdout and to stderr, as
    ## strings.

    my ($prog, @args) = @_;
    my ($out, $err) = ("", "");
    my ($outpipe_in, $outpipe_out, $errpipe_in, $errpipe_out);
    pipe $outpipe_in, $outpipe_out or fail "pipe ($prog out): $!";
    pipe $errpipe_in, $errpipe_out or fail "pipe ($prog err): $!";
    defined (my $kid = fork) or fail "fork ($prog): $!";

    if ($kid == 0) {
        ## Child: attach the write ends of the pipes to stdout and stderr
        ## and become PROG.
        close $outpipe_in
            and close $errpipe_in
            and open STDOUT, ">&", $outpipe_out
            and open STDERR, ">&", $errpipe_out
            and exec $prog, @args;

        ## Only reached if something above went wrong. Report it and leave
        ## via `_exit' rather than `exit': the ordinary exit path would run
        ## the END block, and the cleanups registered there belong to the
        ## parent.
        my $errno = $! + 0;
        whine "exec $prog: $!";
        require POSIX;
        POSIX::_exit($errno || 255);
    }

    ## Parent: close our copies of the write ends so that we see end-of-file
    ## once the child is done with them.
    close $outpipe_out;
    close $errpipe_out;

    ## Collect output from both pipes until each reports end-of-file.
    my %open = (out => [$outpipe_in, \$out], err => [$errpipe_in, \$err]);
    while (%open) {
        my @r = ();
        for my $which (sort keys %open) {
            my ($pipe, $acc) = @{$open{$which}};
            push @r, $pipe => sub {
                my $buf = doread $pipe;
                if (defined $buf) { $$acc .= $buf; }
                else { close $pipe; delete $open{$which}; }
            };
        }
        sel undef, \@r;
    }

    ## `waitpid' returns -1 on failure, which is true, so compare against
    ## the process-id we expect rather than testing for truth.
    waitpid($kid, 0) == $kid or fail "waitpid ($prog): $!";
    return $?, $out, $err;
}
| 207 | |
| 208 | ###-------------------------------------------------------------------------- |
| 209 | ### Monitoring udev events. |
| 210 | |
sub umon_create (@) {
    ## umon_create ARGS...
    ##
    ## Create a udev monitor, with the given `udevadm monitor' arguments, and
    ## return an object: a hashref holding the monitor's process-id (KID),
    ## the pipe carrying its output (PIPE, set non-blocking), and any output
    ## read but not yet parsed (BUF). We always select only kernel events.
    ## We try to wait for the monitor to start up before returning, and fail
    ## if it hasn't done so within five seconds. Don't trust this: use
    ## `umon_sync' anyway.

    my @args = @_;
    my $u = {};

    ## Start the monitor process, and make sure it gets killed when we exit.
    $u->{KID} = open($u->{PIPE}, "-|",
                     "stdbuf", "-o0",
                     "udevadm", "monitor", "--kernel", "--property", @args)
        or fail "open (umon): $!";
    cleanup { kill 9, $u->{KID} };

    ## `blocking' returns the previous setting, or `undef' on error, so test
    ## for definedness rather than truth.
    defined $u->{PIPE}->blocking(0) or fail "set non-blocking (umon): $!";

    ## Wait for the end of the preamble, indicated by the first blank line.
    ## From observation with strace(1), this means that the monitor has
    ## successfully attached itself to its netlink socket and is ready to fetch
    ## events.
    my $ok = 0;
    my $buf = "";
    my $end = time() + 5;
    until ($ok) {
        my $left = $end - time();
        $left > 0 or fail "umon timeout";
        sel
            $left,
            [ $u->{PIPE} => sub {
                  defined (my $chunk = doread $u->{PIPE})
                      or fail "read (umon): eof";
                  $buf .= $chunk;

                  ## Keep whatever follows the blank line: it's the start
                  ## of the event stream. The `/s' matters, since events
                  ## span several lines and may arrive in the same read as
                  ## the preamble.
                  if ($buf =~ /\n\n(.*)\z/s) { $ok = 1; $buf = $1; }
              }
            ];
    }
    $u->{BUF} = $buf;

    ## Done.
    return $u;
}
| 255 | |
sub umon_read ($) {
    ## umon_read UMON
    ##
    ## Read whatever events UMON has ready, without blocking, and return them
    ## as a list of hash references mapping property names to values. An
    ## incomplete event at the end of the input is kept in UMON's buffer for
    ## next time. Fails if the monitor's pipe reaches end-of-file.

    my ($u) = @_;
    my @events = ();

    while (1) {
        my $chunk = doread $u->{PIPE};
        defined $chunk or fail "read (umon): end of file";
        last if $chunk eq "";

        ## Events are separated by blank lines. The final piece is either
        ## empty or an unfinished event: either way, put it back.
        my @paras = split /\n\n/, $u->{BUF} . $chunk, -1;
        $u->{BUF} = pop @paras;

        ## Pick out the `NAME=VALUE' lines of each event; anything else is
        ## ignored.
        for my $para (@paras) {
            my %prop = ();
            for my $line (split /\n/, $para) {
                $line =~ /^(\w+)=(.*)$/ or next;
                $prop{$1} = $2;
            }
            push @events, \%prop;
        }
    }

    return @events;
}
| 276 | |
sub umon_sync ($$) {
    ## umon_sync UMON, DEV
    ##
    ## Wait for UMON to report an event about the device DEV (without its
    ## `/dev/' prefix). An event on DEV is provoked with `udevadm trigger'
    ## at the outset, and again every two seconds until one turns up.
    ## Returns the list of other events read along the way: the event on DEV
    ## itself is dropped.

    my ($u, $dev) = @_;
    my @others = ();
    my $synced = 0;
    my $deadline = 0;
    my $clock = time;
    burble "sync with udev";

    ## Sort through a batch of events: everything except the first event on
    ## DEV is passed back to the caller.
    my $on_readable = sub {
        my @batch = umon_read $u;
        for my $i (0 .. $#batch) {
            if ($batch[$i]{DEVNAME} eq $dev) {
                $synced = 1;
                push @others, @batch[$i + 1 .. $#batch];
                last;
            }
            push @others, $batch[$i];
        }
    };

    while (!$synced) {

        ## Nothing yet (or we've only just started). Trigger a change event
        ## and give it a couple of seconds.
        if ($clock >= $deadline) {
            $deadline = $clock + 2;
            run "trigger $dev", "udevadm", "trigger", "--sysname-match=$dev";
        }

        ## Now read events and see what happens.
        sel $deadline - $clock, [ $u->{PIPE} => $on_readable ];
        $clock = time;
    }

    return @others;
}
| 317 | |
| 318 | ###-------------------------------------------------------------------------- |
| 319 | ### Main code. |
| 320 | |
## Parse the command line.
our $USAGE = "usage: $QUIS VGNAME/LVNAME";

sub version {
    ## Print the program's name and version number on stdout.
    printf "%s, version %s\n", $QUIS, $VERSION;
}

sub help {
    ## Print the usage line, followed by a summary of the options, on
    ## stdout.
    print "$USAGE\n\n", <<'EOF';
Options:
-h, --help Show this help text.
-v, --version Show the program version number.
-d, --debug Show debugging information.
-n, --no-act Don't take corrective actions.
EOF
}
| 335 | |
## Process the options, and insist on exactly one positional argument of the
## form VGNAME/LVNAME; anything else earns a usage message on stderr and
## exit status 1. The last `/' divides the volume group from the logical
## volume. The help text advertises `--no-act', so accept that spelling as
## well as the older `--noact'.
our $NOACT = 0;
GetOptions('help|h|?' => sub { version; help; exit; },
           'version|v' => sub { version; exit; },
           'debug|d' => \$DEBUG,
           'no-act|noact|n' => \$NOACT)
    and @ARGV == 1
    and $ARGV[0] =~ m:(.+)/(.+):
    or do { print STDERR $USAGE, "\n"; exit 1; };
our ($VG, $LV) = ($1, $2);
| 345 | |
## Before wheeling out the big guns, make sure that the volume we've been
## given actually exists, that it's a device-mapper device, and that the
## first line of its table has the `snapshot' target type.
dmtable_update;
our $SYS = devsys "/dev/$VG/$LV";
burble "sysfs name is $SYS";
my $t = $DMTAB{dmname $SYS};
$t or fail "/dev/$VG/$LV isn't a device-mapper device";
if ($DEBUG) {
    burble "found table...";
    for my $row (@$t) { burble "\t" . join " ", @$row; }
}
fail "/dev/$VG/$LV isn't a snapshot" unless $t->[0][2] eq "snapshot";
| 358 | |
## Create a udev monitor. We're only interested in disk-shaped block
## devices. (If we use some other device kind for synchronization then this
## filter will have to be broadened.)
my $u = umon_create "--subsystem-match=block/disk";

## Prepare for the awful synchronization hack. We need to make sure, below,
## that we've read all of the interesting events resulting from an `lvremove'
## call. To do this, we wait for an event on a different device -- but we
## must avoid being fooled by spurious events on this device. As an attempt
## to minimize the probability of this going wrong, acquire a pet device
## which nobody else is using. The best idea seems to be a loopback device.
open my $lopipe, "-|", "losetup", "--show", "--find", "/etc/motd"
    or fail "open (losetup attach): $!";
my $lo = $lopipe->getline;
{ local $/ = undef; <$lopipe>; }

## Closing a pipe waits for the child: a nonzero exit shows up in `$?', not
## `$!'. Check that before trusting anything the child printed, and then
## check that it really did name a device.
$lopipe->close or fail "losetup attach failed (rc = $?)";
defined $lo or fail "losetup attach didn't report a device";
chomp $lo;
$lo =~ s:^/dev/::;
cleanup { system "losetup", "--detach", "/dev/$lo" };

## Initial synchronization, to make sure stuff works.
umon_sync $u, $lo;
| 380 | |
## Make the first attempt at removing the snapshot, holding on to its output.
## The only outcome we go on to handle is exit code 5 (wait status 0x500).
## For anything else, pass the output along and finish with a status that
## mirrors what happened to `lvremove': its exit code, or the raw wait
## status plus 128 if it had no exit code.
burble "initial attempt to remove snapshot";
my ($rc, $out, $err) = capture "lvremove", "--force", "$VG/$LV";
unless ($rc == 0x500) {
    print STDOUT $out;
    print STDERR $err;
    burble "lvremove didn't explode (rc = $rc): we're done here";
    my $code = ($rc >> 8) ? $rc >> 8
             : ($rc & 255) ? $rc + 128
             : $rc;
    exit $code;
}
burble "initial lvremove failed";
| 394 | |
## OK, stuff went wrong. First see if there was a udev cookie left over, and
## if so try to release it. It's important to know that we've read all of
## the relevant uevents, so synchronize again.
##
## The cookies of interest are those mentioned in events on our snapshot.
## Each is recorded as its low 16 bits combined with the prefix 0xd4d0000,
## presumably to match the form in which `dmsetup udevcookies' reports
## them.
my @events = umon_sync $u, $lo;
my %cookie = ();
for my $ev (@events) {
    next unless $ev->{DEVPATH} eq $SYS && exists $ev->{DM_COOKIE};
    $cookie{($ev->{DM_COOKIE} & 0xffff) | 0xd4d0000} = 1;
}
burble "cookies used: " . join ", ", map { sprintf "0x%x", $_ } keys %cookie;

## Find the used cookies which are still extant, and release them. The
## filehandle is a proper lexical rather than an undeclared package global.
open my $uc, "-|", "dmsetup", "udevcookies" or fail "open (cookies): $!";
$uc->getline;                   # discard the first line (presumably headings)
my @leak = ();
while (my $l = $uc->getline) {
    my @f = split ' ', $l;
    next unless @f;             # nothing to look at on a blank line
    push @leak, $f[0] if $cookie{fixint($f[0])};
}
close $uc or fail "udevcookies failed (rc = $?)";
for my $c (@leak) {
    burble "release leaked cookie $c";
    run "release cookie", "dmsetup", "udevreleasecookie", $c unless $NOACT;
}
| 419 | |
## With bad luck, the origin volume has been left suspended, and the next
## removal attempt would get stuck on it, so resume it now. (There's no need
## to check whether it's running already: resuming is idempotent.) Tracking
## down the origin is the annoying part: look through the device-mapper table
## (as read before the removal attempt) for a device whose sole table line is
## a `snapshot-origin' naming the same backing store as the snapshot.
my $back = $DMTAB{dmname $SYS}[0][3];
burble "backend device $back";
my @origins = grep {
    my $tab = $DMTAB{$_};
    @$tab == 1
        && $tab->[0][2] eq "snapshot-origin"
        && $tab->[0][3] eq $back
} keys %DMTAB;
@origins > 1 and fail "snapshot appears to have multiple origins";
my $orig = $origins[0];
defined $orig or fail "couldn't find snapshot origin device";
burble "found origin volume $orig; resuming...";
unless ($NOACT) { run "resume origin $orig", "dmsetup", "resume", $orig; }
| 440 | |
## Have another go at removing the snapshot, now that the obstacles ought to
## be out of the way.
burble "retry snapshot removal";
unless ($NOACT) { run "retry", "lvremove", "--force", "$VG/$LV"; }

## We should be recovering now. Re-read the device-mapper table: if the
## origin has stopped being a `snapshot-origin' but the backend device is
## still hanging around, then remove the backend too.
dmtable_update;
if (-d "/sys/dev/block/$back") {
    my $backdm = dmname "/dev/block/$back";
    unless ($DMTAB{$orig}[0][2] eq "snapshot-origin") {
        burble "origin released but backend $backdm still exists: remove";
        $NOACT or run "remove backend $backdm", "dmsetup", "remove", $backdm;
    }
}

## And that's everything. Not so terrible after all.
burble "completed successfully";
exit 0;
| 460 | |
| 461 | ###----- That's all, folks -------------------------------------------------- |