Commit | Line | Data |
---|---|---|
c8292b34 MW |
1 | /* -*-scala-*- |
2 | * | |
3 | * Extract data from `tar' archives | |
4 | * | |
5 | * (c) 2018 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of the Trivial IP Encryption (TrIPE) Android app. | |
11 | * | |
12 | * TrIPE is free software: you can redistribute it and/or modify it under | |
13 | * the terms of the GNU General Public License as published by the Free | |
14 | * Software Foundation; either version 3 of the License, or (at your | |
15 | * option) any later version. | |
16 | * | |
17 | * TrIPE is distributed in the hope that it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
20 | * for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU General Public License | |
23 | * along with TrIPE. If not, see <https://www.gnu.org/licenses/>. | |
24 | */ | |
25 | ||
26 | package uk.org.distorted.tripe; | |
27 | ||
28 | /*----- Imports -----------------------------------------------------------*/ | |
29 | ||
30 | import java.io.{Closeable, InputStream}; | |
31 | import java.nio.ByteBuffer; | |
32 | import java.nio.charset.Charset; | |
33 | import java.util.Date; | |
34 | ||
35 | /*----- Main code ---------------------------------------------------------*/ | |
36 | ||
/* Raised when the archive contents can't be made sense of (truncated input,
 * bad checksum, malformed numeric field, ...).  MSG is the human-readable
 * diagnosis.
 */
class TarFormatError(msg: String) extends Exception(msg);
38 | ||
trait TarEntry {
  /* The public face of an archive entry.  Ideally this would simply be
   * `TarFile#Entry', but Scala rejects the circular trait inheritance which
   * that would require, so the interface lives out here on its own.
   */

  /* The raw facts recorded about the entry. */
  def name: String;
  def size: Long;
  def typ: Char;
  def mode: Int;
  def mtime: Date;
  def uid: Int;
  def gid: Int;
  def link: String;

  /* Type predicates (deliberately named like those of `FileInfo'). */
  def isreg: Boolean = typ == 0 || typ == '0' || typ == '7';
  def ishardlink: Boolean = typ == '1';
  def islnk: Boolean = typ == '2';
  def ischr: Boolean = typ == '3';
  def isblk: Boolean = typ == '4';
  def isdir: Boolean = typ == '5';
  def isfifo: Boolean = typ == '6';
  def issock: Boolean = false;

  def verbose: String = {
    /* Render the entry's header information as a single line of text: a
     * type character, nine permission characters, then owner, group, size,
     * modification time and name.
     */

    val bits = mode;
    def on(mask: Int): Boolean = (bits&mask) != 0;

    /* One character for the entry type. */
    val kind: Char = typ match {
      case 0 | '0' | '7' => '-'
      case '1' => 'L'
      case '2' => 'l'
      case '3' => 'c'
      case '4' => 'b'
      case '5' => 'd'
      case '6' => '|'
      case _ => '?'
    };

    /* Three characters for one read/write/execute group.  SPECIAL is the
     * set-id or sticky bit which shares the third column with execute:
     * LOWER is shown if both are set, UPPER if only SPECIAL is.
     */
    def triad(special: Int, r: Int, w: Int, x: Int,
              lower: Char, upper: Char): String = {
      val rd = if (on(r)) 'r' else '-';
      val wr = if (on(w)) 'w' else '-';
      val ex =
        if (!on(special)) { if (on(x)) 'x' else '-' }
        else if (on(x)) lower
        else upper;
      new String(Array(rd, wr, ex))
    }
    val perms =
      triad(0x800, 0x100, 0x080, 0x040, 's', 'S') +
      triad(0x400, 0x020, 0x010, 0x008, 's', 'S') +
      triad(0x200, 0x004, 0x002, 0x001, 't', 'T');

    /* The remaining fields are plain formatting. */
    s"$kind$perms" +
      f" $uid%8d $gid%8d $size%12d $mtime%tFT%<tT%<tz $name%s"
  }

  override def toString(): String = s"${getClass.getName}($verbose)";

  /* Access to the entry's content.  `withStream' hands the stream to BODY
   * and closes it afterwards, whether or not BODY completes normally.
   */
  def stream: InputStream;
  def withStream[T](body: InputStream => T): T = {
    val s = stream;
    try body(s)
    finally s.close()
  }
}
114 | ||
115 | class TarFile(in: InputStream) | |
116 | extends LookaheadIterator[TarEntry] with Closeable { tar => | |
117 | ||
/* Tokens are just objects, meaningful only for their identity.  A fresh
 * token is minted each time the lock is released (see `unlock'), so a
 * token captured earlier stops matching `locktok' from then on.
 */
private[TarFile] class Token;

/* Some useful state.  `offset' is advanced by every successful read or skip
 * on the underlying stream; `nexthdr' is set when a header is parsed, to
 * the block-aligned position just past that entry's data.
 */
private[TarFile] var offset: Long = 0; // current byte offset
private[this] var lockp = false; // locked by open entry?
private[this] var locktok = new Token; // active lock token
private[this] var nexthdr: Long = 0; // byte offset of next header
private[this] val hdr = new Array[Byte](512); // header under consideration
127 | ||
/* Cleaning up: closing the archive closes the underlying input stream, and
 * the finalizer does the same as a last resort.
 */
override def close(): Unit = in.close();

override protected def finalize(): Unit = {
  super.finalize();
  close();
}
131 | ||
private[this] def eoferr(): Unit = {
  /* Complain that the input ran out early, reporting how far we'd got. */

  throw new TarFormatError(s"unexpected EOF (at $offset)");
}
134 | ||
135 | /* Locking machinery. | |
136 | * | |
137 | * We work from a primitive `InputStream' which we can't seek. From this, | |
138 | * we must be able to extract file contents, as an `InputStream', and parse | |
139 | * file headers. We'll be badly lost if we lose track of where we are in | |
140 | * the archive. | |
141 | * | |
142 | * So, there's a lock, which can be held by at most one actor at a time: | |
143 | * either the `TarFile' itself, while it's (hopefully) reading a header | |
144 | * block, or by the `Stream' object which lets the caller read an | |
145 | * individual entry's content. Furthermore, if we start activating the | |
146 | * per-entry streams out of order, we'll get confused about where we're | |
147 | * meant to be, so there's also a `token' which represents a participant's | |
148 | * right to claim the lock. The `TarFile' itself has special privileges | |
149 | * and doesn't need a token, but the per-entry streams do, and only the | |
150 | * stream associated with the most recently-read header is allowed to claim | |
151 | * the lock. | |
152 | */ | |
153 | ||
private[this] def lock(): Unit = {
  /* Take exclusive use of the input stream on behalf of the `TarFile'
   * itself.  It's an error if somebody already holds the lock.
   */

  if (!lockp) lockp = true;
  else throw new IllegalArgumentException("tarfile lock still held");
}
160 | ||
private[TarFile] def lock(tok: Token): Unit = {
  /* Take exclusive use of the input stream on the strength of TOK.  Only
   * the currently active token is honoured; any other is rejected as
   * stale before the lock itself is examined.
   */

  if (tok eq locktok) lock();
  else throw new IllegalArgumentException("stale lock token");
}
168 | ||
private[TarFile] def unlock(): Unit = {
  /* Give up the input stream.  The active token is replaced at the same
   * time, so whichever token was valid until now can't be used to claim
   * the lock again.
   */

  assert(lockp);
  locktok = new Token;
  lockp = false;
}
176 | ||
177 | /* Doing I/O on the input stream. | |
178 | * | |
179 | * Our `Stream' object sneakily grabs single bytes from the input. Given | |
180 | * the way Scala works, we can't prevent that, so roll with it. | |
181 | */ | |
182 | ||
private[TarFile] def read(buf: Array[Byte], start: Int, len: Int): Unit = {
  /* Read exactly LEN bytes of input into BUF, beginning at index START.
   * The underlying stream may deliver the data in several pieces, so keep
   * going until the region is full.  Hitting end-of-file first is
   * diagnosed as a `TarFormatError'.  Advances the cursor (`offset') by
   * the number of bytes read.
   */

  val limit = start + len;
  var pos = start;

  /* The loop must run until POS reaches the end of the region, i.e.,
   * START + LEN -- not LEN, which is only the same thing when START is
   * zero.
   */
  while (pos < limit) {
    val n = in.read(buf, pos, limit - pos);
    if (n < 0) eoferr();
    pos += n; offset += n;
  }
}
196 | ||
private[TarFile] def skip(len: Long): Unit = {
  /* Skip ahead LEN bytes in the archive, advancing the cursor (`offset')
   * to match.  (The int/long discrepancy matches Java's bizarre
   * `InputStream' protocol.)  Running out of input before LEN bytes have
   * been passed over is diagnosed as a `TarFormatError'.  Does nothing if
   * LEN isn't positive.
   */

  var remain = len;
  while (remain > 0) {
    val n = in.skip(remain);

    if (n > 0) { remain -= n; offset += n; }
    else {
      /* It's hard to work out whether this means we've hit EOF or not.  It
       * seems best to check.  We must have at least one byte left to skip
       * or we wouldn't have started this iteration, so try to read that.
       * If that works, then there's more stuff available and skipping
       * isn't working, so read the rest into a scratch buffer and throw
       * it away.
       */

      if (in.read() == -1) eoferr();
      remain -= 1; offset += 1;

      /* Ugh.  So, buffers it is then.  Note that `read' advances `offset'
       * itself.  The loop runs until nothing remains: in particular it
       * must not run at all if that single byte was the last one wanted,
       * since the scratch buffer is then empty and no progress could ever
       * be made.
       */
      val buf = new Array[Byte]((remain min 4096).toInt);
      while (remain > 0) {
        val m = (remain min buf.length).toInt;
        read(buf, 0, m);
        remain -= m;
      }
    }
  }
}
228 | ||
private[TarFile] class Stream(end: Long, tok: Token) extends InputStream {
  /* An input stream for a single archive entry's content.
   *
   * END is the archive offset just past the entry's data: reads report
   * end-of-stream (-1) once the archive cursor reaches it.  TOK is the
   * lock token issued with the entry.  The stream holds the archive lock
   * from construction until it's closed.
   */

  /* Claim the lock.  If we're stale, this won't work. */
  lock(tok);
  private[this] var open = true;

  private[this] def checkopen(): Unit = {
    /* Complain if the stream is closed.  This must consult our own flag
     * rather than the archive's lock state: once we're closed, the lock
     * may well be held again by somebody else, and that mustn't bring
     * this stream back to life.
     */

    if (!open) throw new IllegalArgumentException("stream is closed");
  }

  override def read(): Int = {
    /* Read one byte, returning it, or -1 at the end of the entry.  Running
     * out of archive before the end of the entry is a format error.
     */

    checkopen();
    if (offset >= end) -1
    else {
      val b = in.read();
      if (b == -1) eoferr();
      offset += 1;
      b
    }
  }

  override def read(buf: Array[Byte], start: Int, len: Int): Int = {
    /* Read a block of at most LEN bytes into BUF starting at index START.
     * Returns the number of bytes stored, or -1 at the end of the entry.
     * The request is clamped so as not to run past the end of the entry.
     */

    checkopen();
    if (offset >= end) -1
    else {
      val n = (len.toLong min (end - offset)).toInt;
      tar.read(buf, start, n);
      n
    }
  }

  override def close(): Unit = {
    /* Release the lock.  Closing an already-closed stream does nothing. */

    if (open) { unlock(); open = false; }
  }
}
275 | ||
private[this] class Entry(val name: String, val size: Long,
                          val typ: Char, val mode: Int,
                          val mtime: Date,
                          val uid: Int, val gid: Int,
                          val link: String,
                          end: Long, tok: Token)
        extends TarEntry{
  /* See `TarEntry' for why we have this silliness.  Most of the work is in
   * the constructor above: the `val' parameters implement the abstract
   * accessors directly.  END and TOK are passed on to the content stream.
   */

  /* The content stream is created on first use and the same object is
   * returned thereafter.  Creating it claims the archive lock, so this
   * throws if TOK has gone stale or the lock is already held.
   */
  lazy val stream: InputStream = new Stream(end, tok);
}
289 | ||
290 | /* Utilities for parsing archive-entry header blocks. */ | |
291 | ||
private[this] def string(off: Int, len: Int): String = {
  /* Extract a text field from the header block: LEN bytes at OFF, cut
   * short at the first null byte if there is one.
   *
   * POSIX.1-2008 says that header fields should be ISO/IEC 646, but
   * strange things can turn up, especially in filenames.  The bytes are
   * decoded using the platform's default character set, since that will
   * map most easily if a program writes files from the archive out under
   * their recorded names.
   */

  /* Walk forward to the terminator, or the end of the field. */
  val limit = off + len;
  var end = off;
  while (end < limit && hdr(end) != 0) end += 1;

  /* Decode just that much of the original buffer. */
  val bytes = ByteBuffer.wrap(hdr, off, end - off);
  Charset.defaultCharset.newDecoder.decode(bytes).toString
}
315 | ||
private[this] def number(off: Int, len: Int, max: Long): Long = {
  /* Parse a number from the header block: the LEN-byte field at OFF.
   * POSIX.1-2008 says that numbers are in octal and terminated by space
   * or nul.  A field which is filled entirely with digits, leaving no
   * room for a terminator, is accepted too: the end of the field then
   * serves as the terminator.
   *
   * Returns the value.  Throws `TarFormatError' if a byte other than an
   * octal digit precedes the terminator, or if the value would exceed
   * MAX.
   */

  /* The header block has already been read by the time we're called, so
   * the cursor sits just past it.  Work out where the block started so
   * that error positions refer to the offending bytes in the archive.
   */
  val base = offset - hdr.length;

  val limit = off + len;
  var n = 0L; // accumulate the value
  var i = off;
  var done = false;
  while (!done && i < limit) {
    val b = hdr(i);

    /* See if we're done now. */
    if (b == ' ' || b == 0) done = true;
    else {
      if (b < '0' || b > '7')
        throw new TarFormatError(s"bad octal digit (at ${base + i})");

      /* Convert to a digit. */
      val m = b - '0';

      /* Check for overflow -- without overflowing.
       *
       * Write max = 8 N + M with 0 <= M < 8.  We overflow if 8 n + m >
       * 8 N + M, i.e., 8 n > 8 N + (M - m), so n > N + floor((M - m)/8).
       * Integer division truncates towards zero rather than flooring, so
       * bias the numerator by 8 to keep it positive and subtract 1 from
       * the quotient afterwards.
       */
      if (n > max/8 + (8 + max%8 - m)/8 - 1)
        throw new TarFormatError(s"number out of range (at ${base + off})");

      /* Accumulate and go round again. */
      n = 8*n + m;
      i += 1;
    }
  }
  n
}
348 | ||
override protected def fetch(): Option[TarEntry] = {
  /* Read and decode the next header block.  Returns the entry it
   * describes, or `None' if the block is the all-zero end marker.  Throws
   * `TarFormatError' if the header is damaged or the input runs out.
   */

  /* Hold the archive lock while touching the input; this fails if an
   * entry's stream is still open.  (`withCleaner' is defined elsewhere:
   * presumably it runs the registered cleanup when the block exits.)
   */
  withCleaner { clean =>
    lock(); clean { unlock(); }

    /* Pass over whatever lies between here and the next header, and then
     * read the header block itself.  The end of the archive is marked by
     * two zero blocks, so the archive is broken if there isn't at least
     * one block here.
     */
    skip(nexthdr - offset);
    read(hdr, 0, 512);
  }

  /* An entirely zero-filled block marks the end of the archive.  No good
   * can come from checking the block after it.
   */
  if (hdr.forall(_ == 0)) None
  else {

    /* Verify the checksum: the sum of all the header bytes, taken as
     * unsigned (Java's bytes are signed, hence the masking), with the
     * eight bytes of the checksum field itself counted as spaces.
     */
    val ck = (0 until 512).foldLeft(0) { (sum, i) =>
      sum + (if (148 <= i && i < 156) ' '.toInt else hdr(i)&0xff)
    };
    val wantck = number(148, 8, 0x20000);
    if (ck != wantck) {
      throw new TarFormatError(
        s"invalid checksum $ck /= $wantck (at $nexthdr)");
    }

    /* A proper POSIX `ustar' header is recognized by its `magic' and
     * `version' fields, and gets special treatment below.
     */
    val magic = string(257, 6);
    val version = string(263, 2);
    val posixp = magic == "ustar" && version == "00";

    /* The entry's name.  In a POSIX archive, the leading part of a long
     * name is stashed in a prefix field at the end of the header.  Don't
     * look there unless we're sure, because old GNU `tar' used that space
     * for other things.
     */
    val tail = string(0, 100);
    val name =
      if (posixp && hdr(345) != 0) string(345, 155) + '/' + tail
      else tail;

    /* The straightforward numeric fields, and the type code. */
    val mode = number(100, 8, 0xfff).toInt;
    val uid = number(108, 8, Int.MaxValue).toInt;
    val gid = number(116, 8, Int.MaxValue).toInt;
    val typ = hdr(156).toChar;
    val mtime = number(136, 12, Long.MaxValue);

    /* The size field is irrelevant, and possibly even misleading, for
     * links, devices, directories and fifos (consider systems where
     * directories need to be preallocated), so those always count as
     * empty.
     */
    val size: Long =
      if ('1' <= typ && typ <= '6') 0L
      else number(124, 12, Long.MaxValue);

    /* Only hard and symbolic links have a target worth reading. */
    val link = if (typ == '1' || typ == '2') string(157, 100) else "";

    /* The content ends SIZE bytes on from here, and the next header
     * begins at the following 512-byte boundary.
     */
    val end = offset + size;
    nexthdr = (end + 511) & ~511L;

    /* Return the finished archive entry. */
    Some(new Entry(name, size, typ, mode,
                   new Date(1000*mtime), uid, gid, link,
                   end, locktok))
  }
}
433 | } | |
434 | ||
435 | /* Example: | |
436 | * | |
437 | * for (e <- new TarFile(new GZIPInputStream(tarball.open())); if e.isreg) | |
438 | * e withStream { in => | |
439 | * val h = java.security.MessageDigest.getInstance("SHA-256"); | |
440 | * for ((buf, n) <- in.blocks) h.update(buf, 0, n); | |
441 | * val hex = new String(h.digest flatMap { _.formatted("%02x") }); | |
442 | * println(s"$hex ${e.name}"); | |
443 | * } | |
444 | */ | |
445 | ||
446 | /*----- That's all, folks -------------------------------------------------*/ |