+/* -*-scala-*-
+ *
+ * Extract data from `tar' archives
+ *
+ * (c) 2018 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of the Trivial IP Encryption (TrIPE) Android app.
+ *
+ * TrIPE is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your
+ * option) any later version.
+ *
+ * TrIPE is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with TrIPE. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+package uk.org.distorted.tripe;
+
+/*----- Imports -----------------------------------------------------------*/
+
+import java.io.{Closeable, InputStream};
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.Date;
+
+/*----- Main code ---------------------------------------------------------*/
+
+class TarFormatError(msg: String) extends Exception(msg);
+
+trait TarEntry {
+ /* Honestly, I'd rather just have `TarFile#Entry', but Scala doesn't permit
+ * the trait inheritance circularity. So this is a cardboard cutout
+ * version of `Entry'.
+ */
+
+ /* Basic facts about the entry. */
+ def name: String;
+ def size: Long;
+ def typ: Char;
+ def mode: Int;
+ def mtime: Date;
+ def uid: Int;
+ def gid: Int;
+ def link: String;
+
+ /* Type predicates (intentionally like `FileInfo'). */
+ def isfifo: Boolean = typ == '6';
+ def ischr: Boolean = typ == '3';
+ def isdir: Boolean = typ == '5';
+ def isblk: Boolean = typ == '4';
+ def isreg: Boolean = typ match {
+ case 0 | '0' | '7' => true
+ case _ => false
+ }
+ def islnk: Boolean = typ == '2';
+ def issock: Boolean = false;
+ def ishardlink: Boolean = typ == '1';
+
+ def verbose: String = {
+ /* Encode information about this tar header as a string. */
+
+ val sb = new StringBuilder;
+
+ /* First, the type code. */
+ sb += (typ match {
+ case 0 | '0' | '7' => '-'
+ case '1' => 'L'
+ case '2' => 'l'
+ case '3' => 'c'
+ case '4' => 'b'
+ case '5' => 'd'
+ case '6' => '|'
+ case _ => '?'
+ })
+
+ /* Then the permissions bits. Ugh, the permissions bits. */
+ def perm(s: Int, r: Int, w: Int, x: Int, schar: Char, Schar: Char) {
+ sb += (if ((mode&r) != 0) 'r' else '-');
+ sb += (if ((mode&w) != 0) 'w' else '-');
+ sb += (if ((mode&s) != 0)
+ if ((mode&x) != 0) schar else Schar;
+ else
+ if ((mode&x) != 0) 'x' else '-');
+ }
+ perm(0x800, 0x100, 0x080, 0x040, 's', 'S');
+ perm(0x400, 0x020, 0x010, 0x008, 's', 'S');
+ perm(0x200, 0x004, 0x002, 0x001, 't', 'T');
+
+ /* And the rest, which is easy. */
+ sb ++= f" $uid%8d $gid%8d $size%12d $mtime%tFT%<tT%<tz $name%s";
+
+ /* Done. */
+ sb.result
+ }
+
+ override def toString(): String = s"${getClass.getName}($verbose)";
+
+ def stream: InputStream;
+ def withStream[T](body: InputStream => T): T = {
+ val s = stream;
+ try { body(s) }
+ finally { s.close(); }
+ }
+}
+
+class TarFile(in: InputStream)
+ extends LookaheadIterator[TarEntry] with Closeable { tar =>
+
+ /* Tokens are just objects, meaningful only for their identity. */
+ private[TarFile] class Token;
+
+ /* Some useful state. */
+ private[TarFile] var offset: Long = 0; // current byte offset
+ private[this] var lockp = false; // locked by open entry?
+ private[this] var locktok = new Token; // active lock token
+ private[this] var nexthdr: Long = 0; // byte offset of next header
+ private[this] val hdr = new Array[Byte](512); // header under consideration
+
+ /* Making sure we clean up properly. */
+ override def close() { in.close(); }
+ override protected def finalize() { super.finalize(); close(); }
+
+ private[this] def eoferr()
+ { throw new TarFormatError(s"unexpected EOF (at $offset)"); }
+
+ /* Locking machinery.
+ *
+ * We work from a primitive `InputStream' which we can't seek. From this,
+ * we must be able to extract file contents, as an `InputStream', and parse
+ * file headers. We'll be badly lost if we lose track of where we are in
+ * the archive.
+ *
+ * So, there's a lock, which can be held by at most one actor at a time:
+ * either the `TarFile' itself, while it's (hopefully) reading a header
+ * block, or by the `Stream' object which lets the caller read an
+ * individual entry's content. Furthermore, if we start activating the
+ * per-entry streams out of order, we'll get confused about where we're
+ * meant to be, so there's also a `token' which represents a participant's
+ * right to claim the lock. The `TarFile' itself has special privileges
+ * and doesn't need a token, but the per-entry streams do, and only the
+ * stream associated with the most recently-read header is allowed to claim
+ * the lock.
+ */
+
+ private[this] def lock() {
+ /* Claim exclusive use of the input stream. */
+
+ if (lockp) throw new IllegalArgumentException("tarfile lock still held");
+ lockp = true;
+ }
+
+ private[TarFile] def lock(tok: Token) {
+ /* Claim exclusive use of the input stream, passing a token. */
+
+ if (tok ne locktok)
+ throw new IllegalArgumentException("stale lock token");
+ lock();
+ }
+
+ private[TarFile] def unlock() {
+ /* Release the input stream so someone else can have a go. */
+
+ assert(lockp);
+ lockp = false;
+ locktok = new Token;
+ }
+
+ /* Doing I/O on the input stream.
+ *
+ * Our `Stream' object sneakily grabs single bytes from the input. Given
+ * the way Scala works, we can't prevent that, so roll with it.
+ */
+
+ private[TarFile] def read(buf: Array[Byte], start: Int, len: Int) {
+ /* Read input data into the indicated region of the buffer. Short reads
+ * are diagnosed as errors. Advances the cursor.
+ */
+
+ var pos = start;
+ val limit = start + len;
+ while (pos < len) {
+ val n = in.read(buf, pos, limit - pos);
+ if (n < 0) eoferr();
+ pos += n; offset += n;
+ }
+ }
+
+ private[TarFile] def skip(len: Long) {
+ /* Skip ahead LEN bytes in the archive. (The int/long discrepancy
+ * matches Java's bizarre `InputStream' protocol.)
+ */
+
+ var remain = len;
+ while (remain > 0) {
+ val n = in.skip(remain);
+
+ if (n > 0) { remain -= n; offset += n; }
+ else {
+ /* It's hard to work out whether this means we've hit EOF or not. It
+ * seems best to check. We must have at least one byte left to skip
+ * or we wouldn't have started this iteration, so try to read that.
+ * If that works, then there's more stuff available and skipping
+ * isn't working, so start to read buffers and discard them.
+ */
+
+ if (in.read() == -1) eoferr();
+ remain -= 1; offset += 1;
+
+ /* Ugh. So, buffers it is then. */
+ val buf = new Array[Byte]((remain min 4096).toInt);
+ while (remain >= buf.length) {
+ val n = (remain min buf.length).toInt;
+ read(buf, 0, n);
+ remain -= n;
+ }
+ }
+ }
+ }
+
+ private[TarFile] class Stream(end: Long, tok: Token) extends InputStream {
+ /* An input stream for a single archive entry's content. */
+
+ /* Claim the lock. If we're stale, this won't work. */
+ lock(tok);
+ private[this] var open = true;
+
+ private[this] def checkopen() {
+ /* Complain if the stream is closed. */
+
+ if (!lockp) throw new IllegalArgumentException("stream is closed");
+ }
+
+ override def read(): Int = {
+ /* Read one byte. Don't know why there isn't a default implementation
+ * of this.
+ */
+
+ checkopen();
+ if (offset >= end) -1
+ else {
+ val b = in.read();
+ if (b == -1) eoferr();
+ offset += 1;
+ b
+ }
+ }
+
+ override def read(buf: Array[Byte], start: Int, len: Int): Int = {
+ /* Read a block. */
+
+ checkopen();
+ if (offset >= end) -1
+ else {
+ var n = (len.toLong min (end - offset)).toInt;
+ tar.read(buf, start, n);
+ n
+ }
+ }
+
+ override def close() {
+ /* Release the lock. */
+
+ if (open) { unlock(); open = false; }
+ }
+ }
+
+ private[this] class Entry(val name: String, val size: Long,
+ val typ: Char, val mode: Int,
+ val mtime: Date,
+ val uid: Int, val gid: Int,
+ val link: String,
+ end: Long, tok: Token)
+ extends TarEntry{
+ /* See `TarEntry' for why we have this silliness. Most of the work is in
+ * the constructor above.
+ */
+
+ lazy val stream: InputStream = new Stream(end, tok);
+ }
+
+ /* Utilities for parsing archive-entry header blocks. */
+
+ private[this] def string(off: Int, len: Int): String = {
+ /* Parse a string from the block header. POSIX.1-2008 says that header
+ * fields should be ISO/IEC 646, but strange things can turn up
+ * especially in filenames. I'm going to translate strings according to
+ * the local character set, because that will map most easily if a
+ * program tries to write out files from the archive with their
+ * associated names.
+ */
+
+ /* First, find the null terminator, if there is one. Scala doesn't make
+ * this especially easy. Rustle up a view to limit the search.
+ */
+ val bview = hdr.view(off, off + len);
+ val n = bview.indexOf(0) match {
+ case -1 => len
+ case nul => nul
+ };
+
+ /* And then decode the relevant portion of the orignal buffer. */
+ val dec = Charset.defaultCharset.newDecoder;
+ val in = ByteBuffer.wrap(hdr, off, n);
+ dec.decode(in).toString
+ }
+
+ private[this] def number(off: Int, len: Int, max: Long): Long = {
+ /* Parse a number from the block header. POSIX.1-2008 says that numbers
+ * are in octal and terminated by space or nul.
+ */
+
+ var n = 0l; // accumulate the value
+ for (i <- off until off + len) {
+ val b = hdr(i);
+
+ /* See if we're done now. */
+ if (b == ' ' || b == 0) return n;
+ else if (b < '0' || b > '7')
+ throw new TarFormatError(s"bad octal digit (at ${offset + off + i})");
+
+ /* Convert to a digit. */
+ val m = b - '0';
+
+ /* Check for overflow -- without overflowing.
+ *
+ * Write max 8 N + M. We overflow if 8 n + m > 8 N + M, i.e., 8 n >
+ * 8 N + (M - m), so n > N + (M - m)/8. This last calculation is a
+ * little fraught because Scala has the wrong semantics when dividing
+ * negative integers.
+ */
+ if (n > max/8 + (8 + max%8 - m)/8 - 1)
+ throw new TarFormatError(s"number out of range (at ${offset + off})");
+
+ /* Accumulate and go round again. */
+ n = 8*n + (b - '0');
+ }
+ unreachable;
+ }
+
+ override protected def fetch(): Option[TarEntry] = {
+ /* Collect the next archive header and return it as a file entry. */
+
+ /* Make sure that we can actually do this. */
+ withCleaner { clean =>
+ lock(); clean { unlock(); }
+
+ /* Skip ahead to the next header. */
+ skip(nexthdr - offset);
+
+ /* Read the header. The end of the archive is marked by two zero
+ * blocks, so the archive is broken if there isn't at least one here.
+ */
+ read(hdr, 0, 512);
+ }
+
+ /* If the block is entirely zero-filled then declare this file at an
+ * end. No good can come from checking the next block.
+ */
+ if (hdr.forall(_ == 0)) return None;
+
+ /* Verify the checksum. Pay attention because Java's bytes are
+ * (idiotically) signed.
+ */
+ var ck: Int = 8*' '; // pretend chksum field is spaces
+ for (i <- 0 until 148) ck += hdr(i)&0xff;
+ for (i <- 156 until 512) ck += hdr(i)&0xff;
+ val wantck = number(148, 8, 0x20000);
+ if (ck != wantck) {
+ throw new TarFormatError(
+ s"invalid checksum $ck /= $wantck (at $nexthdr)");
+ }
+
+ /* Fetch the `magic' and `version' fields. If this is a proper POSIX
+ * `ustar' file then special processing will apply.
+ */
+ val magic = string(257, 6);
+ val version = string(263, 2);
+ val posixp = magic == "ustar" && version == "00";
+
+ /* Figure out this entry's name. If this is a POSIX archive, then part
+ * of the name is stashed at the end of the header because of old, bad
+ * decisions. But don't look there unless we're sure because old GNU
+ * `tar' used that space for other things.
+ */
+ val name = {
+ val tail = string(0, 100);
+ if (!posixp || hdr(345) == 0) tail
+ else {
+ val prefix = string(345, 155);
+ prefix + '/' + tail
+ }
+ }
+
+ /* Read some other easy stuff. */
+ val mode = number(100, 8, 0xfff).toInt;
+ val uid = number(108, 8, Int.MaxValue).toInt;
+ val gid = number(116, 8, Int.MaxValue).toInt;
+ val typ = hdr(156).toChar;
+ val mtime = number(136, 12, Long.MaxValue);
+
+ /* The size is irrelevant, and possibly even misleading, for some entry
+ * types. We're not interested, for example, in systems where
+ * directories need to be preallocated.
+ */
+ val size = typ match {
+ case '1' | '2' | '3' | '4' | '5' | '6' => 0
+ case _ => number(124, 12, Long.MaxValue)
+ }
+
+ /* Maybe fetch the link name. */
+ val link = typ match {
+ case '1' | '2' => string(157, 100)
+ case _ => ""
+ }
+
+ /* Figure out where the next header ought to be. */
+ nexthdr = (offset + size + 511)& -512;
+
+ /* Return the finished archive entry. */
+ Some(new Entry(name, size, typ, mode,
+ new Date(1000*mtime), uid, gid, link,
+ offset + size, locktok));
+ }
+}
+
+/* Example:
+ *
+ * for (e <- TarFile(new GZIPInputStream(tarball.open())); if e.isreg)
+ * e withStream { in =>
+ * val h = java.security.MessageDigest.getInstance("SHA-256");
+ * for ((buf, n) <- in.blocks) h.update(b, 0, n);
+ * val hex = new String(h.digest flatMap { _.formatted("%02x") });
+ * println("s$hex ${e.name}");
+ * }
+ */
+
+/*----- That's all, folks -------------------------------------------------*/