| 1 | /* -*-scala-*- |
| 2 | * |
| 3 | * Extract data from `tar' archives |
| 4 | * |
| 5 | * (c) 2018 Straylight/Edgeware |
| 6 | */ |
| 7 | |
| 8 | /*----- Licensing notice --------------------------------------------------* |
| 9 | * |
| 10 | * This file is part of the Trivial IP Encryption (TrIPE) Android app. |
| 11 | * |
| 12 | * TrIPE is free software: you can redistribute it and/or modify it under |
| 13 | * the terms of the GNU General Public License as published by the Free |
| 14 | * Software Foundation; either version 3 of the License, or (at your |
| 15 | * option) any later version. |
| 16 | * |
| 17 | * TrIPE is distributed in the hope that it will be useful, but WITHOUT |
| 18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 20 | * for more details. |
| 21 | * |
| 22 | * You should have received a copy of the GNU General Public License |
| 23 | * along with TrIPE. If not, see <https://www.gnu.org/licenses/>. |
| 24 | */ |
| 25 | |
| 26 | package uk.org.distorted.tripe; |
| 27 | |
| 28 | /*----- Imports -----------------------------------------------------------*/ |
| 29 | |
| 30 | import java.io.{Closeable, InputStream}; |
| 31 | import java.nio.ByteBuffer; |
| 32 | import java.nio.charset.Charset; |
| 33 | import java.util.Date; |
| 34 | |
| 35 | /*----- Main code ---------------------------------------------------------*/ |
| 36 | |
/* Raised when the archive being read is malformed; MSG describes the
 * problem.
 */
class TarFormatError(msg: String)
        extends Exception(msg);
| 38 | |
trait TarEntry {
  /* A description of a single archive member.
   *
   * Ideally this would simply be `TarFile#Entry', but Scala objects to the
   * circular trait inheritance that would involve, so this trait stands in
   * for it.  Implementations supply the basic facts and the content
   * stream; everything else here is derived from those.
   */

  /* Basic facts about the entry. */
  def name: String;
  def size: Long;
  def typ: Char;
  def mode: Int;
  def mtime: Date;
  def uid: Int;
  def gid: Int;
  def link: String;

  /* Type predicates (intentionally like `FileInfo').  Each one just
   * compares the header's type code; there is no code for sockets, so
   * `issock' is always false.
   */
  def isfifo: Boolean = typ == '6';
  def ischr: Boolean = typ == '3';
  def isdir: Boolean = typ == '5';
  def isblk: Boolean = typ == '4';
  def isreg: Boolean = typ == 0 || typ == '0' || typ == '7';
  def islnk: Boolean = typ == '2';
  def issock: Boolean = false;
  def ishardlink: Boolean = typ == '1';

  def verbose: String = {
    /* Render this entry as a one-line listing: a type character, nine
     * permission characters, then the uid, gid, size, modification time
     * and name.
     */

    val out = new StringBuilder;

    /* The type character comes first; unknown codes show as `?'. */
    val kind = typ match {
      case '1' => 'L'
      case '2' => 'l'
      case '3' => 'c'
      case '4' => 'b'
      case '5' => 'd'
      case '6' => '|'
      case 0 | '0' | '7' => '-'
      case _ => '?'
    };
    out += kind;

    /* Next, three permission triads.  Each triad has a read bit, a write
     * bit, an execute bit, and a `special' bit (setuid, setgid or sticky)
     * which changes how the execute position is drawn: SET if execute is
     * also on, BARE if it isn't.
     */
    def triad(special: Int, r: Int, w: Int, x: Int,
              set: Char, bare: Char): Unit = {
      def on(bit: Int): Boolean = (mode&bit) != 0;
      out += (if (on(r)) 'r' else '-');
      out += (if (on(w)) 'w' else '-');
      out += ((on(special), on(x)) match {
        case (true, true) => set
        case (true, false) => bare
        case (false, true) => 'x'
        case (false, false) => '-'
      });
    }
    triad(0x800, 0x100, 0x080, 0x040, 's', 'S');
    triad(0x400, 0x020, 0x010, 0x008, 's', 'S');
    triad(0x200, 0x004, 0x002, 0x001, 't', 'T');

    /* The remaining fields are a simple matter of formatting. */
    out ++= f" $uid%8d $gid%8d $size%12d $mtime%tFT%<tT%<tz $name%s";

    out.toString
  }

  /* Show the implementing class's name wrapped around the listing line. */
  override def toString(): String = getClass.getName + "(" + verbose + ")";

  /* The entry's content. */
  def stream: InputStream;

  def withStream[T](body: InputStream => T): T = {
    /* Run BODY on the entry's content stream, closing the stream afterwards
     * whether or not BODY completes normally; return BODY's result.
     */

    val content = stream;
    try body(content)
    finally content.close()
  }
}
| 114 | |
class TarFile(in: InputStream)
        extends LookaheadIterator[TarEntry] with Closeable { tar =>
  /* Read a `tar' archive from the (unseekable) stream IN, yielding its
   * entries one at a time as `TarEntry' objects.  Each entry's content is
   * available as an `InputStream', but only until the next header is
   * fetched: see the notes on locking below.
   */

  /* Tokens are just objects, meaningful only for their identity. */
  private[TarFile] class Token;

  /* Some useful state. */
  private[TarFile] var offset: Long = 0;        // current byte offset
  private[this] var lockp = false;              // locked by open entry?
  private[this] var locktok = new Token;        // active lock token
  private[this] var nexthdr: Long = 0;          // byte offset of next header
  private[this] val hdr = new Array[Byte](512); // header under consideration

  /* Making sure we clean up properly. */
  override def close(): Unit = { in.close(); }
  override protected def finalize(): Unit = { super.finalize(); close(); }

  /* Report that the archive ended before we expected it to. */
  private[this] def eoferr(): Nothing =
    throw new TarFormatError(s"unexpected EOF (at $offset)");

  /* Locking machinery.
   *
   * We work from a primitive `InputStream' which we can't seek.  From this,
   * we must be able to extract file contents, as an `InputStream', and parse
   * file headers.  We'll be badly lost if we lose track of where we are in
   * the archive.
   *
   * So, there's a lock, which can be held by at most one actor at a time:
   * either the `TarFile' itself, while it's (hopefully) reading a header
   * block, or by the `Stream' object which lets the caller read an
   * individual entry's content.  Furthermore, if we start activating the
   * per-entry streams out of order, we'll get confused about where we're
   * meant to be, so there's also a `token' which represents a participant's
   * right to claim the lock.  The `TarFile' itself has special privileges
   * and doesn't need a token, but the per-entry streams do, and only the
   * stream associated with the most recently-read header is allowed to claim
   * the lock.
   */

  private[this] def lock(): Unit = {
    /* Claim exclusive use of the input stream. */

    if (lockp) throw new IllegalArgumentException("tarfile lock still held");
    lockp = true;
  }

  private[TarFile] def lock(tok: Token): Unit = {
    /* Claim exclusive use of the input stream, passing a token.  Only the
     * current token is acceptable.
     */

    if (tok ne locktok)
      throw new IllegalArgumentException("stale lock token");
    lock();
  }

  private[TarFile] def unlock(): Unit = {
    /* Release the input stream so someone else can have a go.  A fresh
     * token is minted, which invalidates every token handed out so far.
     */

    assert(lockp);
    lockp = false;
    locktok = new Token;
  }

  /* Doing I/O on the input stream.
   *
   * Our `Stream' object sneakily grabs single bytes from the input.  Given
   * the way Scala works, we can't prevent that, so roll with it.
   */

  private[TarFile] def read(buf: Array[Byte], start: Int, len: Int): Unit = {
    /* Read exactly LEN bytes of input into BUF, beginning at index START.
     * Short reads are diagnosed as errors.  Advances the cursor.
     */

    val limit = start + len;
    var pos = start;

    /* The loop has to run until POS reaches the end of the region, not
     * until it reaches LEN: the two only coincide when START is zero.
     */
    while (pos < limit) {
      val n = in.read(buf, pos, limit - pos);
      if (n < 0) eoferr();
      pos += n; offset += n;
    }
  }

  private[TarFile] def skip(len: Long): Unit = {
    /* Skip ahead LEN bytes in the archive, advancing the cursor.  Running
     * out of input first is an error.  (The int/long discrepancy matches
     * Java's bizarre `InputStream' protocol.)
     */

    var remain = len;
    while (remain > 0) {
      val n = in.skip(remain);

      if (n > 0) { remain -= n; offset += n; }
      else {
        /* It's hard to work out whether this means we've hit EOF or not.  It
         * seems best to check.  We must have at least one byte left to skip
         * or we wouldn't have started this iteration, so try to read that.
         * If that works, then there's more stuff available and skipping
         * isn't working, so start to read buffers and discard them.
         */

        if (in.read() == -1) eoferr();
        remain -= 1; offset += 1;

        /* Ugh.  So, buffers it is then.  Keep going until nothing is left:
         * if the byte we just read was the last one then the loop simply
         * doesn't run.  (`read' looks after `offset' for us.)
         */
        val buf = new Array[Byte]((remain min 4096).toInt);
        while (remain > 0) {
          val chunk = (remain min buf.length).toInt;
          read(buf, 0, chunk);
          remain -= chunk;
        }
      }
    }
  }

  private[TarFile] class Stream(end: Long, tok: Token) extends InputStream {
    /* An input stream for a single archive entry's content, which finishes
     * at archive offset END.
     */

    /* Claim the lock.  If we're stale, this won't work. */
    lock(tok);
    private[this] var open = true;

    private[this] def checkopen(): Unit = {
      /* Complain if the stream is closed.  This looks at our own flag
       * rather than the archive's lock, because the lock might since have
       * been taken by somebody else.
       */

      if (!open) throw new IllegalArgumentException("stream is closed");
    }

    override def read(): Int = {
      /* Read one byte, or return -1 at the end of the entry.  Don't know
       * why there isn't a default implementation of this.
       */

      checkopen();
      if (offset >= end) -1
      else {
        val b = in.read();
        if (b == -1) eoferr();
        offset += 1;
        b
      }
    }

    override def read(buf: Array[Byte], start: Int, len: Int): Int = {
      /* Read a block of at most LEN bytes into BUF at START, stopping at
       * the end of the entry.  Returns the number of bytes stored, or -1 if
       * the entry is already exhausted.
       */

      checkopen();
      if (offset >= end) -1
      else {
        val n = (len.toLong min (end - offset)).toInt;
        tar.read(buf, start, n);
        n
      }
    }

    override def close(): Unit = {
      /* Release the lock.  Closing a second time does nothing. */

      if (open) { unlock(); open = false; }
    }
  }

  private[this] class Entry(val name: String, val size: Long,
                            val typ: Char, val mode: Int,
                            val mtime: Date,
                            val uid: Int, val gid: Int,
                            val link: String,
                            end: Long, tok: Token)
          extends TarEntry {
    /* See `TarEntry' for why we have this silliness.  Most of the work is in
     * the constructor above.
     */

    /* The stream is made (and the lock claimed) on first use. */
    lazy val stream: InputStream = new Stream(end, tok);
  }

  /* Utilities for parsing archive-entry header blocks. */

  private[this] def string(off: Int, len: Int): String = {
    /* Parse a string from the block header.  POSIX.1-2008 says that header
     * fields should be ISO/IEC 646, but strange things can turn up
     * especially in filenames.  I'm going to translate strings according to
     * the local character set, because that will map most easily if a
     * program tries to write out files from the archive with their
     * associated names.
     */

    /* First, find the null terminator, if there is one.  A terminator
     * beyond the end of the field doesn't count.
     */
    val nul = hdr.indexWhere(_ == 0, off);
    val n = if (nul < 0 || nul >= off + len) len else nul - off;

    /* And then decode the relevant portion of the original buffer. */
    val dec = Charset.defaultCharset.newDecoder;
    dec.decode(ByteBuffer.wrap(hdr, off, n)).toString
  }

  private[this] def number(off: Int, len: Int, max: Long): Long = {
    /* Parse a number from the block header, insisting that it be no larger
     * than MAX.  POSIX.1-2008 says that numbers are in octal and terminated
     * by space or nul.  A field which is entirely filled with digits, with
     * no room left for a terminator, is accepted too.
     *
     * Throws `TarFormatError' if a non-octal character turns up before the
     * terminator, or if the value is too large.
     */

    /* The header block is the last thing we read, so it starts this far
     * back from the cursor.  This is only used for error reporting.
     */
    val base = offset - hdr.length;

    val limit = off + len;
    var n = 0L;                         // accumulate the value
    var i = off;
    var done = false;
    while (!done && i < limit) {
      val b = hdr(i);

      /* See if we're done now. */
      if (b == ' ' || b == 0) done = true;
      else {
        if (b < '0' || b > '7')
          throw new TarFormatError(s"bad octal digit (at ${base + i})");

        /* Convert to a digit. */
        val m = b - '0';

        /* Check for overflow -- without overflowing.
         *
         * Write max = 8 N + M.  We overflow if 8 n + m > 8 N + M, i.e.,
         * 8 n > 8 N + (M - m), so n > N + (M - m)/8.  This last calculation
         * is a little fraught because Scala has the wrong semantics when
         * dividing negative integers.
         */
        if (n > max/8 + (8 + max%8 - m)/8 - 1)
          throw new TarFormatError(s"number out of range (at ${base + off})");

        /* Accumulate and go round again. */
        n = 8*n + m;
        i += 1;
      }
    }
    n
  }

  override protected def fetch(): Option[TarEntry] = {
    /* Collect the next archive header and return it as a file entry, or
     * `None' if we've reached the end-of-archive marker.
     */

    /* Make sure that we can actually do this. */
    withCleaner { clean =>
      lock(); clean { unlock(); }

      /* Skip ahead to the next header. */
      skip(nexthdr - offset);

      /* Read the header.  The end of the archive is marked by two zero
       * blocks, so the archive is broken if there isn't at least one here.
       */
      read(hdr, 0, hdr.length);
    }

    /* If the block is entirely zero-filled then declare this file at an
     * end.  No good can come from checking the next block.
     */
    if (hdr.forall(_ == 0)) None
    else Some(parseHeader())
  }

  private[this] def parseHeader(): TarEntry = {
    /* Turn the header block in `hdr' into an entry, and note where the
     * following header is to be found.
     */

    /* Verify the checksum.  Pay attention because Java's bytes are
     * (idiotically) signed.  The checksum field itself counts as though it
     * were filled with spaces.
     */
    val ck = (0 until hdr.length).foldLeft(0) { (acc, i) =>
      acc + (if (i >= 148 && i < 156) ' '.toInt else hdr(i)&0xff)
    };
    val wantck = number(148, 8, 0x20000);
    if (ck != wantck) {
      throw new TarFormatError(
        s"invalid checksum $ck /= $wantck (at $nexthdr)");
    }

    /* Fetch the `magic' and `version' fields.  If this is a proper POSIX
     * `ustar' file then special processing will apply.
     */
    val magic = string(257, 6);
    val version = string(263, 2);
    val posixp = magic == "ustar" && version == "00";

    /* Figure out this entry's name.  If this is a POSIX archive, then part
     * of the name is stashed at the end of the header because of old, bad
     * decisions.  But don't look there unless we're sure because old GNU
     * `tar' used that space for other things.
     */
    val name = {
      val tail = string(0, 100);
      if (posixp && hdr(345) != 0) string(345, 155) + '/' + tail
      else tail
    }

    /* Read some other easy stuff. */
    val mode = number(100, 8, 0xfff).toInt;
    val uid = number(108, 8, Int.MaxValue).toInt;
    val gid = number(116, 8, Int.MaxValue).toInt;
    val typ = hdr(156).toChar;
    val mtime = number(136, 12, Long.MaxValue);

    /* The size is irrelevant, and possibly even misleading, for some entry
     * types.  We're not interested, for example, in systems where
     * directories need to be preallocated.
     */
    val size = typ match {
      case '1' | '2' | '3' | '4' | '5' | '6' => 0L
      case _ => number(124, 12, Long.MaxValue)
    }

    /* Maybe fetch the link name. */
    val link = typ match {
      case '1' | '2' => string(157, 100)
      case _ => ""
    }

    /* Figure out where the next header ought to be: content is padded out
     * to a whole number of 512-byte blocks.
     */
    nexthdr = (offset + size + 511)& ~511L;

    /* Return the finished archive entry. */
    new Entry(name, size, typ, mode,
              new Date(1000*mtime), uid, gid, link,
              offset + size, locktok)
  }
}
| 434 | |
/* Example:
 *
 * for (e <- new TarFile(new GZIPInputStream(tarball.open())); if e.isreg)
 *   e withStream { in =>
 *     val h = java.security.MessageDigest.getInstance("SHA-256");
 *     for ((buf, n) <- in.blocks) h.update(buf, 0, n);
 *     val hex = new String(h.digest flatMap { _.formatted("%02x") });
 *     println(s"$hex ${e.name}");
 *   }
 */
| 445 | |
| 446 | /*----- That's all, folks -------------------------------------------------*/ |