3 * Extract data from `tar' archives
5 * (c) 2018 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of the Trivial IP Encryption (TrIPE) Android app.
12 * TrIPE is free software: you can redistribute it and/or modify it under
13 * the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 3 of the License, or (at your
15 * option) any later version.
17 * TrIPE is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
22 * You should have received a copy of the GNU General Public License
23 * along with TrIPE. If not, see <https://www.gnu.org/licenses/>.
26 package uk.org.distorted.tripe;
28 /*----- Imports -----------------------------------------------------------*/
30 import java.io.{Closeable, InputStream};
31 import java.nio.ByteBuffer;
32 import java.nio.charset.Charset;
33 import java.util.Date;
35 /*----- Main code ---------------------------------------------------------*/
// Exception type for malformed archive data.  In this file it is raised for
// an unexpected end of input, a bad octal digit, a number out of range and a
// header checksum mismatch.
37 class TarFormatError(msg: String) extends Exception(msg);
40 /* Honestly, I'd rather just have `TarFile#Entry', but Scala doesn't permit
41 * the trait inheritance circularity. So this is a cardboard cutout
45 /* Basic facts about the entry. */
55 /* Type predicates (intentionally like `FileInfo'). */
// Each predicate tests the entry's one-character type code `typ'.
56 def isfifo: Boolean = typ == '6'; // FIFO
57 def ischr: Boolean = typ == '3'; // character device
58 def isdir: Boolean = typ == '5'; // directory
59 def isblk: Boolean = typ == '4'; // block device
60 def isreg: Boolean = typ match { // regular file
61 case 0 | '0' | '7' => true // a zero byte, '0' and '7' all count as regular
64 def islnk: Boolean = typ == '2'; // symbolic link
65 def issock: Boolean = false; // no type code is treated as a socket
66 def ishardlink: Boolean = typ == '1'; // hard link
// Describes the entry as one string: a type character, the permission
// characters, then uid, gid, size, modification time and name.
68 def verbose: String = {
69 /* Encode information about this tar header as a string. */
71 val sb = new StringBuilder;
73 /* First, the type code. */
75 case 0 | '0' | '7' => '-'
85 /* Then the permissions bits. Ugh, the permissions bits. */
// `perm' appends the characters for one permission class.  `r', `w' and `x'
// are that class's read/write/execute masks and `s' is its special-bit mask:
// when the special bit is set, the execute slot shows `schar' if execute is
// also set and `Schar' if it is not.
86 def perm(s: Int, r: Int, w: Int, x: Int, schar: Char, Schar: Char) {
87 sb += (if ((mode&r) != 0) 'r' else '-');
88 sb += (if ((mode&w) != 0) 'w' else '-');
89 sb += (if ((mode&s) != 0)
90 if ((mode&x) != 0) schar else Schar;
92 if ((mode&x) != 0) 'x' else '-');
// Owner, group, other.  0x800, 0x400 and 0x200 are octal 04000, 02000 and
// 01000: the set-uid, set-gid and sticky bits.
94 perm(0x800, 0x100, 0x080, 0x040, 's', 'S');
95 perm(0x400, 0x020, 0x010, 0x008, 's', 'S');
96 perm(0x200, 0x004, 0x002, 0x001, 't', 'T');
98 /* And the rest, which is easy. */
// uid, gid and size are right-aligned in fixed-width columns; `mtime' is
// formatted as ISO date, a literal `T', 24-hour time and numeric zone offset
// (the `%<' forms reuse the previous argument).
99 sb ++= f" $uid%8d $gid%8d $size%12d $mtime%tFT%<tT%<tz $name%s";
// Renders as the runtime class name followed by the `verbose' description in
// parentheses.
105 override def toString(): String = s"${getClass.getName}($verbose)";
// Abstract: an implementation supplies an input stream for this entry.
107 def stream: InputStream;
// Runs `body' on an input stream and closes that stream in a `finally'
// clause, so it is closed whether `body' returns or throws.
// NOTE(review): `s' is presumably obtained from `stream' -- confirm.
108 def withStream[T](body: InputStream => T): T = {
111 finally { s.close(); }
115 class TarFile(in: InputStream)
116 extends LookaheadIterator[TarEntry] with Closeable { tar =>
118 /* Tokens are just objects, meaningful only for their identity. */
// `fetch' hands the current `locktok' to each Entry it builds, and the Entry
// passes it on to its Stream.
119 private[TarFile] class Token;
121 /* Some useful state. */
// `offset' is advanced by `read' and `skip'; `nexthdr' is recomputed by
// `fetch' after each header.  `hdr' holds one 512-byte header block.
122 private[TarFile] var offset: Long = 0; // current byte offset
123 private[this] var lockp = false; // locked by open entry?
124 private[this] var locktok = new Token; // active lock token
125 private[this] var nexthdr: Long = 0; // byte offset of next header
126 private[this] val hdr = new Array[Byte](512); // header under consideration
128 /* Making sure we clean up properly. */
// Closing the TarFile closes the underlying input stream.
129 override def close() { in.close(); }
// Backstop: if the object is finalized, the input stream is closed then too.
130 override protected def finalize() { super.finalize(); close(); }
// Always throws: reports an unexpected end of input as a TarFormatError
// carrying the current archive offset.
132 private[this] def eoferr()
133 { throw new TarFormatError(s"unexpected EOF (at $offset)"); }
135 /* Locking machinery.
137 * We work from a primitive `InputStream' which we can't seek. From this,
138 * we must be able to extract file contents, as an `InputStream', and parse
139 * file headers. We'll be badly lost if we lose track of where we are in
142 * So, there's a lock, which can be held by at most one actor at a time:
143 * either the `TarFile' itself, while it's (hopefully) reading a header
144 * block, or by the `Stream' object which lets the caller read an
145 * individual entry's content. Furthermore, if we start activating the
146 * per-entry streams out of order, we'll get confused about where we're
147 * meant to be, so there's also a `token' which represents a participant's
148 * right to claim the lock. The `TarFile' itself has special privileges
149 * and doesn't need a token, but the per-entry streams do, and only the
150 * stream associated with the most recently-read header is allowed to claim
// Token-less variant, private to this TarFile instance (used by `fetch').
// Throws IllegalArgumentException if the lock is already held.
154 private[this] def lock() {
155 /* Claim exclusive use of the input stream. */
157 if (lockp) throw new IllegalArgumentException("tarfile lock still held");
// Token-carrying variant.  A stale token is rejected with
// IllegalArgumentException.
// NOTE(review): the staleness test is presumably a comparison against
// `locktok' -- confirm.
161 private[TarFile] def lock(tok: Token) {
162 /* Claim exclusive use of the input stream, passing a token. */
165 throw new IllegalArgumentException("stale lock token");
// Called from Stream.close, and registered as the cleanup action in `fetch'.
169 private[TarFile] def unlock() {
170 /* Release the input stream so someone else can have a go. */
177 /* Doing I/O on the input stream.
179 * Our `Stream' object sneakily grabs single bytes from the input. Given
180 * the way Scala works, we can't prevent that, so roll with it.
// Fills the region of `buf' from `start' up to `start + len'.  Each call to
// `in.read' asks for whatever is still unfilled (`limit - pos'); the local
// position and the archive-wide `offset' both advance by the count read.
183 private[TarFile] def read(buf: Array[Byte], start: Int, len: Int) {
184 /* Read input data into the indicated region of the buffer. Short reads
185 * are diagnosed as errors. Advances the cursor.
189 val limit = start + len;
191 val n = in.read(buf, pos, limit - pos);
193 pos += n; offset += n;
// Advances the archive by `len' bytes.  `in.skip' is tried first; a positive
// result is credited against the remainder and to `offset'.  Otherwise one
// byte is read to tell end-of-input (reported through `eoferr') from a stream
// that simply will not skip, and the rest is consumed through a scratch
// buffer.
197 private[TarFile] def skip(len: Long) {
198 /* Skip ahead LEN bytes in the archive. (The int/long discrepancy
199 * matches Java's bizarre `InputStream' protocol.)
204 val n = in.skip(remain);
206 if (n > 0) { remain -= n; offset += n; }
208 /* It's hard to work out whether this means we've hit EOF or not. It
209 * seems best to check. We must have at least one byte left to skip
210 * or we wouldn't have started this iteration, so try to read that.
211 * If that works, then there's more stuff available and skipping
212 * isn't working, so start to read buffers and discard them.
215 if (in.read() == -1) eoferr();
216 remain -= 1; offset += 1;
218 /* Ugh. So, buffers it is then. */
// The scratch buffer is allocated once, with at most 4096 bytes.
// NOTE(review): while `remain >= buf.length' holds, `remain min buf.length'
// is always `buf.length', so a tail shorter than the buffer is not consumed
// by this loop -- presumably an enclosing loop retries; confirm.
219 val buf = new Array[Byte]((remain min 4096).toInt);
220 while (remain >= buf.length) {
221 val n = (remain min buf.length).toInt;
// `end' is the archive offset at which this entry's content stops: both read
// methods return -1 once `offset' has reached it.  `tok' is the lock token
// this stream was created with.
229 private[TarFile] class Stream(end: Long, tok: Token) extends InputStream {
230 /* An input stream for a single archive entry's content. */
232 /* Claim the lock. If we're stale, this won't work. */
234 private[this] var open = true; // cleared by close(), which also unlocks
236 private[this] def checkopen() {
237 /* Complain if the stream is closed. */
// Test this stream's own `open' flag rather than the archive-wide `lockp'.
// After this stream is closed the lock can be held again -- by the TarFile
// while it reads a header, or by a later entry's stream -- and `lockp' would
// then wrongly make a closed stream look usable.  Throws
// IllegalArgumentException when the stream has been closed.
239 if (!open) throw new IllegalArgumentException("stream is closed");
// Single-byte read.  Yields -1 once the archive offset has reached this
// entry's `end'; if the underlying input runs out before then (b == -1) the
// condition is reported through `eoferr'.
242 override def read(): Int = {
243 /* Read one byte. Don't know why there isn't a default implementation
248 if (offset >= end) -1
251 if (b == -1) eoferr();
// Bulk read.  Yields -1 at the end of the entry; otherwise the request is
// clamped to the bytes left in the entry (`end - offset') and handed to the
// enclosing TarFile's `read', which fills exactly that many bytes.
257 override def read(buf: Array[Byte], start: Int, len: Int): Int = {
261 if (offset >= end) -1
263 var n = (len.toLong min (end - offset)).toInt;
264 tar.read(buf, start, n);
// Only the first call unlocks; later calls find `open' already false and do
// nothing.
269 override def close() {
270 /* Release the lock. */
272 if (open) { unlock(); open = false; }
// Concrete entry record built by `fetch'.  `end' and `tok' are not exposed as
// members; they are used only to construct the content stream.  Because
// `stream' is a lazy val, the Stream object is created on first access and
// the same object is returned on every later access.
276 private[this] class Entry(val name: String, val size: Long,
277 val typ: Char, val mode: Int,
279 val uid: Int, val gid: Int,
281 end: Long, tok: Token)
283 /* See `TarEntry' for why we have this silliness. Most of the work is in
284 * the constructor above.
287 lazy val stream: InputStream = new Stream(end, tok);
290 /* Utilities for parsing archive-entry header blocks. */
// Decodes a text field of the current header block: `n' bytes of `hdr'
// starting at `off', where `n' is derived from the position of the first zero
// byte within the `len'-byte field.
292 private[this] def string(off: Int, len: Int): String = {
293 /* Parse a string from the block header. POSIX.1-2008 says that header
294 * fields should be ISO/IEC 646, but strange things can turn up
295 * especially in filenames. I'm going to translate strings according to
296 * the local character set, because that will map most easily if a
297 * program tries to write out files from the archive with their
301 /* First, find the null terminator, if there is one. Scala doesn't make
302 * this especially easy. Rustle up a view to limit the search.
304 val bview = hdr.view(off, off + len);
305 val n = bview.indexOf(0) match {
310 /* And then decode the relevant portion of the original buffer. */
// NOTE(review): a freshly created CharsetDecoder reports malformed or
// unmappable input by throwing CharacterCodingException rather than
// substituting a replacement -- confirm that is wanted for odd filenames.
311 val dec = Charset.defaultCharset.newDecoder;
// The local `in' shadows the TarFile's input stream within this method.
312 val in = ByteBuffer.wrap(hdr, off, n);
313 dec.decode(in).toString
// Parses an octal field of `len' bytes at header offset `off', stopping at
// the first space or NUL.  Throws TarFormatError for a byte outside '0'..'7'
// or for a value that would exceed `max'.  The range test adds 8 before the
// integer division so the numerator stays positive (truncation then behaves
// like floor), and the trailing `- 1' compensates for the added 8.
// NOTE(review): the `return' inside the for-loop body is a non-local return
// in Scala 2.
316 private[this] def number(off: Int, len: Int, max: Long): Long = {
317 /* Parse a number from the block header. POSIX.1-2008 says that numbers
318 * are in octal and terminated by space or nul.
321 var n = 0l; // accumulate the value
322 for (i <- off until off + len) {
325 /* See if we're done now. */
// NOTE(review): `i' already starts at `off', so `offset + off + i' in the
// message below counts `off' twice, whereas the out-of-range message uses
// `offset + off'.  Confirm the intended position.
326 if (b == ' ' || b == 0) return n;
327 else if (b < '0' || b > '7')
328 throw new TarFormatError(s"bad octal digit (at ${offset + off + i})");
330 /* Convert to a digit. */
333 /* Check for overflow -- without overflowing.
335 * Write max = 8 N + M. We overflow if 8 n + m > 8 N + M, i.e., 8 n >
336 * 8 N + (M - m), so n > N + (M - m)/8. This last calculation is a
337 * little fraught because Scala has the wrong semantics when dividing
340 if (n > max/8 + (8 + max%8 - m)/8 - 1)
341 throw new TarFormatError(s"number out of range (at ${offset + off})");
343 /* Accumulate and go round again. */
// Reads and parses the next header block, returning None when the block is
// entirely zero and otherwise Some(entry).  Header offsets used below:
// name 0, mode 100, uid 108, gid 116, size 124, mtime 136, checksum 148,
// type code 156, link name 157, magic 257, version 263, name prefix 345.
// Throws TarFormatError on a checksum mismatch or a malformed numeric field.
// NOTE(review): `return None' sits inside the `withCleaner' closure, so in
// Scala 2 it is a non-local return; `withCleaner' is defined elsewhere --
// confirm it lets that propagate and still runs the registered `unlock'.
349 override protected def fetch(): Option[TarEntry] = {
350 /* Collect the next archive header and return it as a file entry. */
352 /* Make sure that we can actually do this. */
// Take the lock for the duration of header parsing and register its release.
353 withCleaner { clean =>
354 lock(); clean { unlock(); }
356 /* Skip ahead to the next header. */
357 skip(nexthdr - offset);
359 /* Read the header. The end of the archive is marked by two zero
360 * blocks, so the archive is broken if there isn't at least one here.
365 /* If the block is entirely zero-filled then declare this file at an
366 * end. No good can come from checking the next block.
368 if (hdr.forall(_ == 0)) return None;
370 /* Verify the checksum. Pay attention because Java's bytes are
371 * (idiotically) signed.
373 var ck: Int = 8*' '; // pretend chksum field is spaces
374 for (i <- 0 until 148) ck += hdr(i)&0xff;
375 for (i <- 156 until 512) ck += hdr(i)&0xff;
376 val wantck = number(148, 8, 0x20000); // 512 bytes of at most 255 each sum to less than 0x20000
378 throw new TarFormatError(
379 s"invalid checksum $ck /= $wantck (at $nexthdr)");
382 /* Fetch the `magic' and `version' fields. If this is a proper POSIX
383 * `ustar' file then special processing will apply.
385 val magic = string(257, 6);
386 val version = string(263, 2);
387 val posixp = magic == "ustar" && version == "00";
389 /* Figure out this entry's name. If this is a POSIX archive, then part
390 * of the name is stashed at the end of the header because of old, bad
391 * decisions. But don't look there unless we're sure because old GNU
392 * `tar' used that space for other things.
395 val tail = string(0, 100);
396 if (!posixp || hdr(345) == 0) tail
398 val prefix = string(345, 155);
403 /* Read some other easy stuff. */
404 val mode = number(100, 8, 0xfff).toInt; // 0xfff is octal 07777
405 val uid = number(108, 8, Int.MaxValue).toInt;
406 val gid = number(116, 8, Int.MaxValue).toInt;
407 val typ = hdr(156).toChar;
408 val mtime = number(136, 12, Long.MaxValue);
410 /* The size is irrelevant, and possibly even misleading, for some entry
411 * types. We're not interested, for example, in systems where
412 * directories need to be preallocated.
414 val size = typ match {
415 case '1' | '2' | '3' | '4' | '5' | '6' => 0
416 case _ => number(124, 12, Long.MaxValue)
419 /* Maybe fetch the link name. */
420 val link = typ match {
421 case '1' | '2' => string(157, 100)
425 /* Figure out where the next header ought to be. */
426 nexthdr = (offset + size + 511)& -512; // end of content, rounded up to a multiple of 512
428 /* Return the finished archive entry. */
429 Some(new Entry(name, size, typ, mode,
430 new Date(1000*mtime), uid, gid, link, // `mtime' is scaled by 1000 into Date's milliseconds
431 offset + size, locktok));
437 * for (e <- TarFile(new GZIPInputStream(tarball.open())); if e.isreg)
438 * e withStream { in =>
439 * val h = java.security.MessageDigest.getInstance("SHA-256");
440 * for ((buf, n) <- in.blocks) h.update(buf, 0, n);
441 * val hex = new String(h.digest flatMap { _.formatted("%02x") });
442 * println(s"$hex ${e.name}");
446 /*----- That's all, folks -------------------------------------------------*/