Commit | Line | Data |
---|---|---|
c8292b34 MW |
1 | /* -*-scala-*- |
2 | * | |
3 | * Extract data from `tar' archives | |
4 | * | |
5 | * (c) 2018 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of the Trivial IP Encryption (TrIPE) Android app. | |
11 | * | |
12 | * TrIPE is free software: you can redistribute it and/or modify it under | |
13 | * the terms of the GNU General Public License as published by the Free | |
14 | * Software Foundation; either version 3 of the License, or (at your | |
15 | * option) any later version. | |
16 | * | |
17 | * TrIPE is distributed in the hope that it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
20 | * for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU General Public License | |
23 | * along with TrIPE. If not, see <https://www.gnu.org/licenses/>. | |
24 | */ | |
25 | ||
26 | package uk.org.distorted.tripe; | |
27 | ||
28 | /*----- Imports -----------------------------------------------------------*/ | |
29 | ||
30 | import java.io.{Closeable, InputStream}; | |
31 | import java.nio.ByteBuffer; | |
32 | import java.nio.charset.Charset; | |
33 | import java.util.Date; | |
34 | ||
a5ec891a MW |
35 | import sys.FileInfo; |
36 | import sys.FileInfo.{Value, FIFO, CHR, DIR, BLK, REG, LNK, HDLNK, UNK}; | |
37 | ||
0157de02 MW |
38 | import Implicits.truish; |
39 | ||
c8292b34 MW |
40 | /*----- Main code ---------------------------------------------------------*/ |
41 | ||
42 | class TarFormatError(msg: String) extends Exception(msg); | |
43 | ||
44 | trait TarEntry { | |
45 | /* Honestly, I'd rather just have `TarFile#Entry', but Scala doesn't permit | |
46 | * the trait inheritance circularity. So this is a cardboard cutout | |
47 | * version of `Entry'. | |
48 | */ | |
49 | ||
50 | /* Basic facts about the entry. */ | |
51 | def name: String; | |
52 | def size: Long; | |
a5ec891a | 53 | def rawtyp: Char; |
c8292b34 MW |
54 | def mode: Int; |
55 | def mtime: Date; | |
56 | def uid: Int; | |
57 | def gid: Int; | |
58 | def link: String; | |
59 | ||
60 | /* Type predicates (intentionally like `FileInfo'). */ | |
a5ec891a MW |
61 | def isfifo: Boolean = rawtyp == '6'; |
62 | def ischr: Boolean = rawtyp == '3'; | |
63 | def isdir: Boolean = rawtyp == '5'; | |
64 | def isblk: Boolean = rawtyp == '4'; | |
65 | def isreg: Boolean = rawtyp match { | |
c8292b34 MW |
66 | case 0 | '0' | '7' => true |
67 | case _ => false | |
68 | } | |
a5ec891a | 69 | def islnk: Boolean = rawtyp == '2'; |
c8292b34 | 70 | def issock: Boolean = false; |
a5ec891a MW |
71 | def ishardlink: Boolean = rawtyp == '1'; |
72 | ||
73 | def typ: FileInfo.Value = rawtyp match { | |
74 | case 0 | '0' | '7' => REG | |
75 | case '1' => HDLNK | |
76 | case '2' => LNK | |
77 | case '3' => CHR | |
78 | case '4' => BLK | |
79 | case '5' => DIR | |
80 | case '6' => FIFO | |
81 | case _ => UNK | |
82 | } | |
c8292b34 MW |
83 | |
84 | def verbose: String = { | |
85 | /* Encode information about this tar header as a string. */ | |
86 | ||
87 | val sb = new StringBuilder; | |
88 | ||
89 | /* First, the type code. */ | |
a5ec891a | 90 | sb += (rawtyp match { |
c8292b34 MW |
91 | case 0 | '0' | '7' => '-' |
92 | case '1' => 'L' | |
93 | case '2' => 'l' | |
94 | case '3' => 'c' | |
95 | case '4' => 'b' | |
96 | case '5' => 'd' | |
97 | case '6' => '|' | |
98 | case _ => '?' | |
99 | }) | |
100 | ||
101 | /* Then the permissions bits. Ugh, the permissions bits. */ | |
102 | def perm(s: Int, r: Int, w: Int, x: Int, schar: Char, Schar: Char) { | |
0157de02 MW |
103 | sb += (if (mode&r) 'r' else '-'); |
104 | sb += (if (mode&w) 'w' else '-'); | |
105 | sb += (if (mode&s) { if (mode&x) schar else Schar; } | |
106 | else { if (mode&x) 'x' else '-' }); | |
c8292b34 MW |
107 | } |
108 | perm(0x800, 0x100, 0x080, 0x040, 's', 'S'); | |
109 | perm(0x400, 0x020, 0x010, 0x008, 's', 'S'); | |
110 | perm(0x200, 0x004, 0x002, 0x001, 't', 'T'); | |
111 | ||
112 | /* And the rest, which is easy. */ | |
113 | sb ++= f" $uid%8d $gid%8d $size%12d $mtime%tFT%<tT%<tz $name%s"; | |
114 | ||
115 | /* Done. */ | |
116 | sb.result | |
117 | } | |
118 | ||
119 | override def toString(): String = s"${getClass.getName}($verbose)"; | |
120 | ||
121 | def stream: InputStream; | |
122 | def withStream[T](body: InputStream => T): T = { | |
123 | val s = stream; | |
124 | try { body(s) } | |
125 | finally { s.close(); } | |
126 | } | |
127 | } | |
128 | ||
129 | class TarFile(in: InputStream) | |
130 | extends LookaheadIterator[TarEntry] with Closeable { tar => | |
131 | ||
132 | /* Tokens are just objects, meaningful only for their identity. */ | |
133 | private[TarFile] class Token; | |
134 | ||
135 | /* Some useful state. */ | |
136 | private[TarFile] var offset: Long = 0; // current byte offset | |
137 | private[this] var lockp = false; // locked by open entry? | |
138 | private[this] var locktok = new Token; // active lock token | |
139 | private[this] var nexthdr: Long = 0; // byte offset of next header | |
140 | private[this] val hdr = new Array[Byte](512); // header under consideration | |
141 | ||
142 | /* Making sure we clean up properly. */ | |
143 | override def close() { in.close(); } | |
144 | override protected def finalize() { super.finalize(); close(); } | |
145 | ||
146 | private[this] def eoferr() | |
147 | { throw new TarFormatError(s"unexpected EOF (at $offset)"); } | |
148 | ||
149 | /* Locking machinery. | |
150 | * | |
151 | * We work from a primitive `InputStream' which we can't seek. From this, | |
152 | * we must be able to extract file contents, as an `InputStream', and parse | |
153 | * file headers. We'll be badly lost if we lose track of where we are in | |
154 | * the archive. | |
155 | * | |
156 | * So, there's a lock, which can be held by at most one actor at a time: | |
157 | * either the `TarFile' itself, while it's (hopefully) reading a header | |
158 | * block, or by the `Stream' object which lets the caller read an | |
159 | * individual entry's content. Furthermore, if we start activating the | |
160 | * per-entry streams out of order, we'll get confused about where we're | |
161 | * meant to be, so there's also a `token' which represents a participant's | |
162 | * right to claim the lock. The `TarFile' itself has special privileges | |
163 | * and doesn't need a token, but the per-entry streams do, and only the | |
164 | * stream associated with the most recently-read header is allowed to claim | |
165 | * the lock. | |
166 | */ | |
167 | ||
168 | private[this] def lock() { | |
169 | /* Claim exclusive use of the input stream. */ | |
170 | ||
171 | if (lockp) throw new IllegalArgumentException("tarfile lock still held"); | |
172 | lockp = true; | |
173 | } | |
174 | ||
175 | private[TarFile] def lock(tok: Token) { | |
176 | /* Claim exclusive use of the input stream, passing a token. */ | |
177 | ||
178 | if (tok ne locktok) | |
179 | throw new IllegalArgumentException("stale lock token"); | |
180 | lock(); | |
181 | } | |
182 | ||
183 | private[TarFile] def unlock() { | |
184 | /* Release the input stream so someone else can have a go. */ | |
185 | ||
186 | assert(lockp); | |
187 | lockp = false; | |
188 | locktok = new Token; | |
189 | } | |
190 | ||
191 | /* Doing I/O on the input stream. | |
192 | * | |
193 | * Our `Stream' object sneakily grabs single bytes from the input. Given | |
194 | * the way Scala works, we can't prevent that, so roll with it. | |
195 | */ | |
196 | ||
197 | private[TarFile] def read(buf: Array[Byte], start: Int, len: Int) { | |
198 | /* Read input data into the indicated region of the buffer. Short reads | |
199 | * are diagnosed as errors. Advances the cursor. | |
200 | */ | |
201 | ||
202 | var pos = start; | |
203 | val limit = start + len; | |
204 | while (pos < len) { | |
205 | val n = in.read(buf, pos, limit - pos); | |
206 | if (n < 0) eoferr(); | |
207 | pos += n; offset += n; | |
208 | } | |
209 | } | |
210 | ||
211 | private[TarFile] def skip(len: Long) { | |
212 | /* Skip ahead LEN bytes in the archive. (The int/long discrepancy | |
213 | * matches Java's bizarre `InputStream' protocol.) | |
214 | */ | |
215 | ||
216 | var remain = len; | |
217 | while (remain > 0) { | |
218 | val n = in.skip(remain); | |
219 | ||
220 | if (n > 0) { remain -= n; offset += n; } | |
221 | else { | |
222 | /* It's hard to work out whether this means we've hit EOF or not. It | |
223 | * seems best to check. We must have at least one byte left to skip | |
224 | * or we wouldn't have started this iteration, so try to read that. | |
225 | * If that works, then there's more stuff available and skipping | |
226 | * isn't working, so start to read buffers and discard them. | |
227 | */ | |
228 | ||
229 | if (in.read() == -1) eoferr(); | |
230 | remain -= 1; offset += 1; | |
231 | ||
232 | /* Ugh. So, buffers it is then. */ | |
233 | val buf = new Array[Byte]((remain min 4096).toInt); | |
234 | while (remain >= buf.length) { | |
235 | val n = (remain min buf.length).toInt; | |
236 | read(buf, 0, n); | |
237 | remain -= n; | |
238 | } | |
239 | } | |
240 | } | |
241 | } | |
242 | ||
243 | private[TarFile] class Stream(end: Long, tok: Token) extends InputStream { | |
244 | /* An input stream for a single archive entry's content. */ | |
245 | ||
246 | /* Claim the lock. If we're stale, this won't work. */ | |
247 | lock(tok); | |
248 | private[this] var open = true; | |
249 | ||
250 | private[this] def checkopen() { | |
251 | /* Complain if the stream is closed. */ | |
252 | ||
253 | if (!lockp) throw new IllegalArgumentException("stream is closed"); | |
254 | } | |
255 | ||
256 | override def read(): Int = { | |
257 | /* Read one byte. Don't know why there isn't a default implementation | |
258 | * of this. | |
259 | */ | |
260 | ||
261 | checkopen(); | |
262 | if (offset >= end) -1 | |
263 | else { | |
264 | val b = in.read(); | |
265 | if (b == -1) eoferr(); | |
266 | offset += 1; | |
267 | b | |
268 | } | |
269 | } | |
270 | ||
271 | override def read(buf: Array[Byte], start: Int, len: Int): Int = { | |
272 | /* Read a block. */ | |
273 | ||
274 | checkopen(); | |
275 | if (offset >= end) -1 | |
276 | else { | |
277 | var n = (len.toLong min (end - offset)).toInt; | |
278 | tar.read(buf, start, n); | |
279 | n | |
280 | } | |
281 | } | |
282 | ||
283 | override def close() { | |
284 | /* Release the lock. */ | |
285 | ||
286 | if (open) { unlock(); open = false; } | |
287 | } | |
288 | } | |
289 | ||
290 | private[this] class Entry(val name: String, val size: Long, | |
a5ec891a | 291 | val rawtyp: Char, val mode: Int, |
c8292b34 MW |
292 | val mtime: Date, |
293 | val uid: Int, val gid: Int, | |
294 | val link: String, | |
295 | end: Long, tok: Token) | |
296 | extends TarEntry{ | |
297 | /* See `TarEntry' for why we have this silliness. Most of the work is in | |
298 | * the constructor above. | |
299 | */ | |
300 | ||
301 | lazy val stream: InputStream = new Stream(end, tok); | |
302 | } | |
303 | ||
304 | /* Utilities for parsing archive-entry header blocks. */ | |
305 | ||
306 | private[this] def string(off: Int, len: Int): String = { | |
307 | /* Parse a string from the block header. POSIX.1-2008 says that header | |
308 | * fields should be ISO/IEC 646, but strange things can turn up | |
309 | * especially in filenames. I'm going to translate strings according to | |
310 | * the local character set, because that will map most easily if a | |
311 | * program tries to write out files from the archive with their | |
312 | * associated names. | |
313 | */ | |
314 | ||
315 | /* First, find the null terminator, if there is one. Scala doesn't make | |
316 | * this especially easy. Rustle up a view to limit the search. | |
317 | */ | |
318 | val bview = hdr.view(off, off + len); | |
319 | val n = bview.indexOf(0) match { | |
320 | case -1 => len | |
321 | case nul => nul | |
322 | }; | |
323 | ||
324 | /* And then decode the relevant portion of the orignal buffer. */ | |
325 | val dec = Charset.defaultCharset.newDecoder; | |
326 | val in = ByteBuffer.wrap(hdr, off, n); | |
327 | dec.decode(in).toString | |
328 | } | |
329 | ||
330 | private[this] def number(off: Int, len: Int, max: Long): Long = { | |
331 | /* Parse a number from the block header. POSIX.1-2008 says that numbers | |
332 | * are in octal and terminated by space or nul. | |
333 | */ | |
334 | ||
335 | var n = 0l; // accumulate the value | |
336 | for (i <- off until off + len) { | |
337 | val b = hdr(i); | |
338 | ||
339 | /* See if we're done now. */ | |
0157de02 | 340 | if (!b || b == ' ') return n; |
c8292b34 MW |
341 | else if (b < '0' || b > '7') |
342 | throw new TarFormatError(s"bad octal digit (at ${offset + off + i})"); | |
343 | ||
344 | /* Convert to a digit. */ | |
345 | val m = b - '0'; | |
346 | ||
347 | /* Check for overflow -- without overflowing. | |
348 | * | |
349 | * Write max 8 N + M. We overflow if 8 n + m > 8 N + M, i.e., 8 n > | |
350 | * 8 N + (M - m), so n > N + (M - m)/8. This last calculation is a | |
351 | * little fraught because Scala has the wrong semantics when dividing | |
352 | * negative integers. | |
353 | */ | |
354 | if (n > max/8 + (8 + max%8 - m)/8 - 1) | |
355 | throw new TarFormatError(s"number out of range (at ${offset + off})"); | |
356 | ||
357 | /* Accumulate and go round again. */ | |
358 | n = 8*n + (b - '0'); | |
359 | } | |
360 | unreachable; | |
361 | } | |
362 | ||
363 | override protected def fetch(): Option[TarEntry] = { | |
364 | /* Collect the next archive header and return it as a file entry. */ | |
365 | ||
366 | /* Make sure that we can actually do this. */ | |
367 | withCleaner { clean => | |
368 | lock(); clean { unlock(); } | |
369 | ||
370 | /* Skip ahead to the next header. */ | |
371 | skip(nexthdr - offset); | |
372 | ||
373 | /* Read the header. The end of the archive is marked by two zero | |
374 | * blocks, so the archive is broken if there isn't at least one here. | |
375 | */ | |
376 | read(hdr, 0, 512); | |
377 | } | |
378 | ||
379 | /* If the block is entirely zero-filled then declare this file at an | |
380 | * end. No good can come from checking the next block. | |
381 | */ | |
382 | if (hdr.forall(_ == 0)) return None; | |
383 | ||
384 | /* Verify the checksum. Pay attention because Java's bytes are | |
385 | * (idiotically) signed. | |
386 | */ | |
387 | var ck: Int = 8*' '; // pretend chksum field is spaces | |
388 | for (i <- 0 until 148) ck += hdr(i)&0xff; | |
389 | for (i <- 156 until 512) ck += hdr(i)&0xff; | |
390 | val wantck = number(148, 8, 0x20000); | |
391 | if (ck != wantck) { | |
392 | throw new TarFormatError( | |
393 | s"invalid checksum $ck /= $wantck (at $nexthdr)"); | |
394 | } | |
395 | ||
396 | /* Fetch the `magic' and `version' fields. If this is a proper POSIX | |
397 | * `ustar' file then special processing will apply. | |
398 | */ | |
399 | val magic = string(257, 6); | |
400 | val version = string(263, 2); | |
401 | val posixp = magic == "ustar" && version == "00"; | |
402 | ||
403 | /* Figure out this entry's name. If this is a POSIX archive, then part | |
404 | * of the name is stashed at the end of the header because of old, bad | |
405 | * decisions. But don't look there unless we're sure because old GNU | |
406 | * `tar' used that space for other things. | |
407 | */ | |
408 | val name = { | |
409 | val tail = string(0, 100); | |
0157de02 | 410 | if (!posixp || !hdr(345)) tail |
c8292b34 MW |
411 | else { |
412 | val prefix = string(345, 155); | |
413 | prefix + '/' + tail | |
414 | } | |
415 | } | |
416 | ||
417 | /* Read some other easy stuff. */ | |
418 | val mode = number(100, 8, 0xfff).toInt; | |
419 | val uid = number(108, 8, Int.MaxValue).toInt; | |
420 | val gid = number(116, 8, Int.MaxValue).toInt; | |
421 | val typ = hdr(156).toChar; | |
422 | val mtime = number(136, 12, Long.MaxValue); | |
423 | ||
424 | /* The size is irrelevant, and possibly even misleading, for some entry | |
425 | * types. We're not interested, for example, in systems where | |
426 | * directories need to be preallocated. | |
427 | */ | |
428 | val size = typ match { | |
429 | case '1' | '2' | '3' | '4' | '5' | '6' => 0 | |
430 | case _ => number(124, 12, Long.MaxValue) | |
431 | } | |
432 | ||
433 | /* Maybe fetch the link name. */ | |
434 | val link = typ match { | |
435 | case '1' | '2' => string(157, 100) | |
436 | case _ => "" | |
437 | } | |
438 | ||
439 | /* Figure out where the next header ought to be. */ | |
440 | nexthdr = (offset + size + 511)& -512; | |
441 | ||
442 | /* Return the finished archive entry. */ | |
443 | Some(new Entry(name, size, typ, mode, | |
444 | new Date(1000*mtime), uid, gid, link, | |
445 | offset + size, locktok)); | |
446 | } | |
447 | } | |
448 | ||
449 | /* Example: | |
450 | * | |
451 | * for (e <- TarFile(new GZIPInputStream(tarball.open())); if e.isreg) | |
452 | * e withStream { in => | |
453 | * val h = java.security.MessageDigest.getInstance("SHA-256"); | |
454 | * for ((buf, n) <- in.blocks) h.update(b, 0, n); | |
455 | * val hex = new String(h.digest flatMap { _.formatted("%02x") }); | |
456 | * println("s$hex ${e.name}"); | |
457 | * } | |
458 | */ | |
459 | ||
460 | /*----- That's all, folks -------------------------------------------------*/ |