//! tar/src/builder.rs — building tar archives from scratch into any writer.
1use std::fs;
2use std::io;
3use std::io::prelude::*;
4use std::path::Path;
5use std::str;
6
7use crate::header::BLOCK_SIZE;
8use crate::header::GNU_SPARSE_HEADERS_COUNT;
9use crate::header::{path2bytes, HeaderMode};
10use crate::GnuExtSparseHeader;
11use crate::{other, EntryType, Header};
12
/// A structure for building archives
///
/// This structure has methods for building up an archive from scratch into any
/// arbitrary writer.
pub struct Builder<W: Write> {
    // Options shared by the `append_*` helpers (header mode, symlink
    // following, sparse-file handling).
    options: BuilderOptions,
    // Set once the terminating zero blocks have been written; makes
    // `finish()` idempotent.
    finished: bool,
    // The underlying writer; wrapped in `Option` so `into_inner` can move
    // it out by value.
    obj: Option<W>,
}
22
#[derive(Clone, Copy)]
struct BuilderOptions {
    // How much metadata to record in entry headers (see `HeaderMode`).
    mode: HeaderMode,
    // Whether to follow symlinks when reading paths from the filesystem.
    follow: bool,
    // Whether to detect sparse files and omit their holes from the archive.
    sparse: bool,
}
29
impl<W: Write> Builder<W> {
    /// Create a new archive builder with the underlying object as the
    /// destination of all data written. The builder will use
    /// `HeaderMode::Complete` by default.
    pub fn new(obj: W) -> Builder<W> {
        Builder {
            options: BuilderOptions {
                mode: HeaderMode::Complete,
                follow: true,
                sparse: true,
            },
            finished: false,
            obj: Some(obj),
        }
    }

    /// Changes the HeaderMode that will be used when reading fs Metadata for
    /// methods that implicitly read metadata for an input Path. Notably, this
    /// does _not_ apply to `append(Header)`.
    pub fn mode(&mut self, mode: HeaderMode) {
        self.options.mode = mode;
    }

    /// Follow symlinks, archiving the contents of the file they point to rather
    /// than adding a symlink to the archive. Defaults to true.
    ///
    /// When true, it exhibits the same behavior as GNU `tar` command's
    /// `--dereference` or `-h` options <https://man7.org/linux/man-pages/man1/tar.1.html>.
    pub fn follow_symlinks(&mut self, follow: bool) {
        self.options.follow = follow;
    }

    /// Handle sparse files efficiently, if supported by the underlying
    /// filesystem. When true, sparse file information is read from disk and
    /// empty segments are omitted from the archive. Defaults to true.
    pub fn sparse(&mut self, sparse: bool) {
        self.options.sparse = sparse;
    }

    /// Gets shared reference to the underlying object.
    pub fn get_ref(&self) -> &W {
        // `obj` is only ever `None` after `into_inner` consumes the builder,
        // so this unwrap cannot fail here.
        self.obj.as_ref().unwrap()
    }

    /// Gets mutable reference to the underlying object.
    ///
    /// Note that care must be taken while writing to the underlying
    /// object. But, e.g. `get_mut().flush()` is claimed to be safe and
    /// useful in the situations when one needs to be ensured that
    /// tar entry was flushed to the disk.
    pub fn get_mut(&mut self) -> &mut W {
        // Same invariant as `get_ref`: `obj` is `Some` for a live builder.
        self.obj.as_mut().unwrap()
    }

    /// Unwrap this archive, returning the underlying object.
    ///
    /// This function will finish writing the archive if the `finish` function
    /// hasn't yet been called, returning any I/O error which happens during
    /// that operation.
    pub fn into_inner(mut self) -> io::Result<W> {
        if !self.finished {
            self.finish()?;
        }
        Ok(self.obj.take().unwrap())
    }

    /// Adds a new entry to this archive.
    ///
    /// This function will append the header specified, followed by contents of
    /// the stream specified by `data`. To produce a valid archive the `size`
    /// field of `header` must be the same as the length of the stream that's
    /// being written. Additionally the checksum for the header should have been
    /// set via the `set_cksum` method.
    ///
    /// Note that this will not attempt to seek the archive to a valid position,
    /// so if the archive is in the middle of a read or some other similar
    /// operation then this may corrupt the archive.
    ///
    /// Also note that after all entries have been written to an archive the
    /// `finish` function needs to be called to finish writing the archive.
    ///
    /// # Errors
    ///
    /// This function will return an error for any intermittent I/O error which
    /// occurs when either reading or writing.
    ///
    /// # Examples
    ///
    /// ```
    /// use tar::{Builder, Header};
    ///
    /// let mut header = Header::new_gnu();
    /// header.set_path("foo").unwrap();
    /// header.set_size(4);
    /// header.set_cksum();
    ///
    /// let mut data: &[u8] = &[1, 2, 3, 4];
    ///
    /// let mut ar = Builder::new(Vec::new());
    /// ar.append(&header, data).unwrap();
    /// let data = ar.into_inner().unwrap();
    /// ```
    pub fn append<R: Read>(&mut self, header: &Header, mut data: R) -> io::Result<()> {
        append(self.get_mut(), header, &mut data)
    }

    /// Adds a new entry to this archive with the specified path.
    ///
    /// This function will set the specified path in the given header, which may
    /// require appending a GNU long-name extension entry to the archive first.
    /// The checksum for the header will be automatically updated via the
    /// `set_cksum` method after setting the path. No other metadata in the
    /// header will be modified.
    ///
    /// Then it will append the header, followed by contents of the stream
    /// specified by `data`. To produce a valid archive the `size` field of
    /// `header` must be the same as the length of the stream that's being
    /// written.
    ///
    /// Note that this will not attempt to seek the archive to a valid position,
    /// so if the archive is in the middle of a read or some other similar
    /// operation then this may corrupt the archive.
    ///
    /// Also note that after all entries have been written to an archive the
    /// `finish` function needs to be called to finish writing the archive.
    ///
    /// # Errors
    ///
    /// This function will return an error for any intermittent I/O error which
    /// occurs when either reading or writing.
    ///
    /// # Examples
    ///
    /// ```
    /// use tar::{Builder, Header};
    ///
    /// let mut header = Header::new_gnu();
    /// header.set_size(4);
    /// header.set_cksum();
    ///
    /// let mut data: &[u8] = &[1, 2, 3, 4];
    ///
    /// let mut ar = Builder::new(Vec::new());
    /// ar.append_data(&mut header, "really/long/path/to/foo", data).unwrap();
    /// let data = ar.into_inner().unwrap();
    /// ```
    pub fn append_data<P: AsRef<Path>, R: Read>(
        &mut self,
        header: &mut Header,
        path: P,
        data: R,
    ) -> io::Result<()> {
        prepare_header_path(self.get_mut(), header, path.as_ref())?;
        header.set_cksum();
        self.append(header, data)
    }

    /// Adds a new entry to this archive and returns an [`EntryWriter`] for
    /// adding its contents.
    ///
    /// This function is similar to [`Self::append_data`] but returns a
    /// [`io::Write`] implementation instead of taking data as a parameter.
    ///
    /// Similar constraints around the position of the archive and completion
    /// apply as with [`Self::append_data`]. It requires the underlying writer
    /// to implement [`Seek`] to update the header after writing the data.
    ///
    /// # Errors
    ///
    /// This function will return an error for any intermittent I/O error which
    /// occurs when either reading or writing.
    ///
    /// # Examples
    ///
    /// ```
    /// use std::io::Cursor;
    /// use std::io::Write as _;
    /// use tar::{Builder, Header};
    ///
    /// let mut header = Header::new_gnu();
    ///
    /// let mut ar = Builder::new(Cursor::new(Vec::new()));
    /// let mut entry = ar.append_writer(&mut header, "hi.txt").unwrap();
    /// entry.write_all(b"Hello, ").unwrap();
    /// entry.write_all(b"world!\n").unwrap();
    /// entry.finish().unwrap();
    /// ```
    pub fn append_writer<'a, P: AsRef<Path>>(
        &'a mut self,
        header: &'a mut Header,
        path: P,
    ) -> io::Result<EntryWriter<'a>>
    where
        W: Seek,
    {
        EntryWriter::start(self.get_mut(), header, path.as_ref())
    }

    /// Adds a new link (symbolic or hard) entry to this archive with the specified path and target.
    ///
    /// This function is similar to [`Self::append_data`] which supports long filenames,
    /// but also supports long link targets using GNU extensions if necessary.
    /// You must set the entry type to either [`EntryType::Link`] or [`EntryType::Symlink`].
    /// The `set_cksum` method will be invoked after setting the path. No other metadata in the
    /// header will be modified.
    ///
    /// If you are intending to use GNU extensions, you must use this method over calling
    /// [`Header::set_link_name`] because that function will fail on long links.
    ///
    /// Similar constraints around the position of the archive and completion
    /// apply as with [`Self::append_data`].
    ///
    /// # Errors
    ///
    /// This function will return an error for any intermittent I/O error which
    /// occurs when either reading or writing.
    ///
    /// # Examples
    ///
    /// ```
    /// use tar::{Builder, Header, EntryType};
    ///
    /// let mut ar = Builder::new(Vec::new());
    /// let mut header = Header::new_gnu();
    /// header.set_username("foo");
    /// header.set_entry_type(EntryType::Symlink);
    /// header.set_size(0);
    /// ar.append_link(&mut header, "really/long/path/to/foo", "other/really/long/target").unwrap();
    /// let data = ar.into_inner().unwrap();
    /// ```
    pub fn append_link<P: AsRef<Path>, T: AsRef<Path>>(
        &mut self,
        header: &mut Header,
        path: P,
        target: T,
    ) -> io::Result<()> {
        self._append_link(header, path.as_ref(), target.as_ref())
    }

    // Monomorphization-friendly inner body of `append_link` (takes concrete
    // `&Path` arguments rather than generics).
    fn _append_link(&mut self, header: &mut Header, path: &Path, target: &Path) -> io::Result<()> {
        prepare_header_path(self.get_mut(), header, path)?;
        prepare_header_link(self.get_mut(), header, target)?;
        header.set_cksum();
        // A link entry carries no data payload.
        self.append(header, std::io::empty())
    }

    /// Adds a file on the local filesystem to this archive.
    ///
    /// This function will open the file specified by `path` and insert the file
    /// into the archive with the appropriate metadata set, returning any I/O
    /// error which occurs while writing. The path name for the file inside of
    /// this archive will be the same as `path`, and it is required that the
    /// path is a relative path.
    ///
    /// Note that this will not attempt to seek the archive to a valid position,
    /// so if the archive is in the middle of a read or some other similar
    /// operation then this may corrupt the archive.
    ///
    /// Also note that after all files have been written to an archive the
    /// `finish` function needs to be called to finish writing the archive.
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use tar::Builder;
    ///
    /// let mut ar = Builder::new(Vec::new());
    ///
    /// ar.append_path("foo/bar.txt").unwrap();
    /// ```
    pub fn append_path<P: AsRef<Path>>(&mut self, path: P) -> io::Result<()> {
        let options = self.options;
        append_path_with_name(self.get_mut(), path.as_ref(), None, options)
    }

    /// Adds a file on the local filesystem to this archive under another name.
    ///
    /// This function will open the file specified by `path` and insert the file
    /// into the archive as `name` with appropriate metadata set, returning any
    /// I/O error which occurs while writing. The path name for the file inside
    /// of this archive will be `name` is required to be a relative path.
    ///
    /// Note that this will not attempt to seek the archive to a valid position,
    /// so if the archive is in the middle of a read or some other similar
    /// operation then this may corrupt the archive.
    ///
    /// Note if the `path` is a directory. This will just add an entry to the archive,
    /// rather than contents of the directory.
    ///
    /// Also note that after all files have been written to an archive the
    /// `finish` function needs to be called to finish writing the archive.
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use tar::Builder;
    ///
    /// let mut ar = Builder::new(Vec::new());
    ///
    /// // Insert the local file "foo/bar.txt" in the archive but with the name
    /// // "bar/foo.txt".
    /// ar.append_path_with_name("foo/bar.txt", "bar/foo.txt").unwrap();
    /// ```
    pub fn append_path_with_name<P: AsRef<Path>, N: AsRef<Path>>(
        &mut self,
        path: P,
        name: N,
    ) -> io::Result<()> {
        let options = self.options;
        append_path_with_name(self.get_mut(), path.as_ref(), Some(name.as_ref()), options)
    }

    /// Adds a file to this archive with the given path as the name of the file
    /// in the archive.
    ///
    /// This will use the metadata of `file` to populate a `Header`, and it will
    /// then append the file to the archive with the name `path`.
    ///
    /// Note that this will not attempt to seek the archive to a valid position,
    /// so if the archive is in the middle of a read or some other similar
    /// operation then this may corrupt the archive.
    ///
    /// Also note that after all files have been written to an archive the
    /// `finish` function needs to be called to finish writing the archive.
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use std::fs::File;
    /// use tar::Builder;
    ///
    /// let mut ar = Builder::new(Vec::new());
    ///
    /// // Open the file at one location, but insert it into the archive with a
    /// // different name.
    /// let mut f = File::open("foo/bar/baz.txt").unwrap();
    /// ar.append_file("bar/baz.txt", &mut f).unwrap();
    /// ```
    pub fn append_file<P: AsRef<Path>>(&mut self, path: P, file: &mut fs::File) -> io::Result<()> {
        let options = self.options;
        append_file(self.get_mut(), path.as_ref(), file, options)
    }

    /// Adds a directory to this archive with the given path as the name of the
    /// directory in the archive.
    ///
    /// This will use `stat` to populate a `Header`, and it will then append the
    /// directory to the archive with the name `path`.
    ///
    /// Note that this will not attempt to seek the archive to a valid position,
    /// so if the archive is in the middle of a read or some other similar
    /// operation then this may corrupt the archive.
    ///
    /// Note this will not add the contents of the directory to the archive.
    /// See `append_dir_all` for recursively adding the contents of the directory.
    ///
    /// Also note that after all files have been written to an archive the
    /// `finish` function needs to be called to finish writing the archive.
    ///
    /// # Examples
    ///
    /// ```
    /// use std::fs;
    /// use tar::Builder;
    ///
    /// let mut ar = Builder::new(Vec::new());
    ///
    /// // Use the directory at one location, but insert it into the archive
    /// // with a different name.
    /// ar.append_dir("bardir", ".").unwrap();
    /// ```
    pub fn append_dir<P, Q>(&mut self, path: P, src_path: Q) -> io::Result<()>
    where
        P: AsRef<Path>,
        Q: AsRef<Path>,
    {
        let options = self.options;
        append_dir(self.get_mut(), path.as_ref(), src_path.as_ref(), options)
    }

    /// Adds a directory and all of its contents (recursively) to this archive
    /// with the given path as the name of the directory in the archive.
    ///
    /// Note that this will not attempt to seek the archive to a valid position,
    /// so if the archive is in the middle of a read or some other similar
    /// operation then this may corrupt the archive.
    ///
    /// Also note that after all files have been written to an archive the
    /// `finish` or `into_inner` function needs to be called to finish
    /// writing the archive.
    ///
    /// # Examples
    ///
    /// ```
    /// use std::fs;
    /// use tar::Builder;
    ///
    /// let mut ar = Builder::new(Vec::new());
    ///
    /// // Use the directory at one location ("."), but insert it into the archive
    /// // with a different name ("bardir").
    /// ar.append_dir_all("bardir", ".").unwrap();
    /// ar.finish().unwrap();
    /// ```
    ///
    /// Use `append_dir_all` with an empty string as the first path argument to
    /// create an archive from all files in a directory without renaming.
    ///
    /// ```
    /// use std::fs;
    /// use std::path::PathBuf;
    /// use tar::{Archive, Builder};
    ///
    /// let tmpdir = tempfile::tempdir().unwrap();
    /// let path = tmpdir.path();
    /// fs::write(path.join("a.txt"), b"hello").unwrap();
    /// fs::write(path.join("b.txt"), b"world").unwrap();
    ///
    /// // Create a tarball from the files in the directory
    /// let mut ar = Builder::new(Vec::new());
    /// ar.append_dir_all("", path).unwrap();
    ///
    /// // List files in the archive
    /// let archive = ar.into_inner().unwrap();
    /// let archived_files = Archive::new(archive.as_slice())
    ///     .entries()
    ///     .unwrap()
    ///     .map(|entry| entry.unwrap().path().unwrap().into_owned())
    ///     .collect::<Vec<_>>();
    ///
    /// assert!(archived_files.contains(&PathBuf::from("a.txt")));
    /// assert!(archived_files.contains(&PathBuf::from("b.txt")));
    /// ```
    pub fn append_dir_all<P, Q>(&mut self, path: P, src_path: Q) -> io::Result<()>
    where
        P: AsRef<Path>,
        Q: AsRef<Path>,
    {
        let options = self.options;
        append_dir_all(self.get_mut(), path.as_ref(), src_path.as_ref(), options)
    }

    /// Finish writing this archive, emitting the termination sections.
    ///
    /// This function should only be called when the archive has been written
    /// entirely and if an I/O error happens the underlying object still needs
    /// to be acquired.
    ///
    /// In most situations the `into_inner` method should be preferred.
    pub fn finish(&mut self) -> io::Result<()> {
        if self.finished {
            return Ok(());
        }
        self.finished = true;
        // An archive is terminated by two consecutive 512-byte blocks of
        // zeros (1024 bytes total).
        self.get_mut().write_all(&[0; 1024])
    }
}
487
// Internal object-safe combination of `Write + Seek`, used by `EntryWriter`
// so it can both stream entry data and seek back to patch the header.
trait SeekWrite: Write + Seek {
    // Upcast to a plain `&mut dyn Write` for helpers that only need writing
    // (trait-object upcasting is done explicitly here).
    fn as_write(&mut self) -> &mut dyn Write;
}

impl<T: Write + Seek> SeekWrite for T {
    fn as_write(&mut self) -> &mut dyn Write {
        self
    }
}
497
/// A writer for a single entry in a tar archive.
///
/// This struct is returned by [`Builder::append_writer`] and provides a
/// [`Write`] implementation for adding content to an archive entry.
///
/// After writing all data to the entry, it must be finalized either by
/// explicitly calling [`EntryWriter::finish`] or by letting it drop.
pub struct EntryWriter<'a> {
    // NOTE: Do not add any fields here which require Drop!
    // See the comment below in finish().
    obj: &'a mut dyn SeekWrite,
    header: &'a mut Header,
    // Number of data bytes written so far; becomes the header's `size`
    // field when the entry is finished.
    written: u64,
}
512
impl EntryWriter<'_> {
    // Begin a new entry: emit any GNU long-name extension for `path`, then
    // reserve one zeroed block where the real header will be written once
    // the data length is known (see `do_finish`).
    fn start<'a>(
        obj: &'a mut dyn SeekWrite,
        header: &'a mut Header,
        path: &Path,
    ) -> io::Result<EntryWriter<'a>> {
        prepare_header_path(obj.as_write(), header, path)?;

        // Reserve space for header, will be overwritten once data is written.
        obj.write_all([0u8; BLOCK_SIZE as usize].as_ref())?;

        Ok(EntryWriter {
            obj,
            header,
            written: 0,
        })
    }

    /// Finish writing the current entry in the archive.
    pub fn finish(self) -> io::Result<()> {
        // NOTE: This is an optimization for "fallible destructuring".
        // We want finish() to return an error, but we also need to invoke
        // cleanup in our Drop handler, which will run unconditionally
        // and try to do the same work.
        // By using ManuallyDrop, we suppress that drop. However, this would
        // be a memory leak if we ever had any struct members which required
        // Drop - which we don't right now.
        // But if we ever gain one, we will need to change to use e.g. Option<>
        // around some of the fields or have a `bool finished` etc.
        let mut this = std::mem::ManuallyDrop::new(self);
        this.do_finish()
    }

    // Pad the data to a block boundary, then seek back and write the real
    // header (with the now-known size) over the block reserved in `start`.
    fn do_finish(&mut self) -> io::Result<()> {
        // Pad with zeros if necessary.
        let buf = [0u8; BLOCK_SIZE as usize];
        // Bytes needed to reach the next 512-byte boundary; 0 when already
        // aligned. `wrapping_sub` keeps the subtraction well-defined when
        // `written > BLOCK_SIZE` (the `% BLOCK_SIZE` makes the result right).
        let remaining = BLOCK_SIZE.wrapping_sub(self.written) % BLOCK_SIZE;
        self.obj.write_all(&buf[..remaining as usize])?;
        let written = (self.written + remaining) as i64;

        // Seek back to the header position.
        self.obj
            .seek(io::SeekFrom::Current(-written - BLOCK_SIZE as i64))?;

        self.header.set_size(self.written);
        self.header.set_cksum();
        self.obj.write_all(self.header.as_bytes())?;

        // Seek forward to restore the position.
        self.obj.seek(io::SeekFrom::Current(written))?;

        Ok(())
    }
}
567
568impl Write for EntryWriter<'_> {
569    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
570        let len = self.obj.write(buf)?;
571        self.written += len as u64;
572        Ok(len)
573    }
574
575    fn flush(&mut self) -> io::Result<()> {
576        self.obj.flush()
577    }
578}
579
impl Drop for EntryWriter<'_> {
    fn drop(&mut self) {
        // Errors cannot be surfaced from `drop`; call `finish()` explicitly
        // to observe I/O failures while finalizing the entry.
        let _ = self.do_finish();
    }
}
585
586fn append(mut dst: &mut dyn Write, header: &Header, mut data: &mut dyn Read) -> io::Result<()> {
587    dst.write_all(header.as_bytes())?;
588    let len = io::copy(&mut data, &mut dst)?;
589    pad_zeroes(&mut dst, len)?;
590    Ok(())
591}
592
593fn pad_zeroes(dst: &mut dyn Write, len: u64) -> io::Result<()> {
594    let buf = [0; BLOCK_SIZE as usize];
595    let remaining = BLOCK_SIZE - (len % BLOCK_SIZE);
596    if remaining < BLOCK_SIZE {
597        dst.write_all(&buf[..remaining as usize])?;
598    }
599    Ok(())
600}
601
602fn append_path_with_name(
603    dst: &mut dyn Write,
604    path: &Path,
605    name: Option<&Path>,
606    options: BuilderOptions,
607) -> io::Result<()> {
608    let stat = if options.follow {
609        fs::metadata(path).map_err(|err| {
610            io::Error::new(
611                err.kind(),
612                format!("{} when getting metadata for {}", err, path.display()),
613            )
614        })?
615    } else {
616        fs::symlink_metadata(path).map_err(|err| {
617            io::Error::new(
618                err.kind(),
619                format!("{} when getting metadata for {}", err, path.display()),
620            )
621        })?
622    };
623    let ar_name = name.unwrap_or(path);
624    if stat.is_file() {
625        append_file(dst, ar_name, &mut fs::File::open(path)?, options)
626    } else if stat.is_dir() {
627        append_fs(dst, ar_name, &stat, options.mode, None)
628    } else if stat.file_type().is_symlink() {
629        let link_name = fs::read_link(path)?;
630        append_fs(dst, ar_name, &stat, options.mode, Some(&link_name))
631    } else {
632        #[cfg(unix)]
633        {
634            append_special(dst, path, &stat, options.mode)
635        }
636        #[cfg(not(unix))]
637        {
638            Err(other(&format!("{} has unknown file type", path.display())))
639        }
640    }
641}
642
/// Append a header-only entry for a special file (FIFO, character device,
/// or block device). Sockets cannot be represented in a tar archive and
/// are rejected with an error.
#[cfg(unix)]
fn append_special(
    dst: &mut dyn Write,
    path: &Path,
    stat: &fs::Metadata,
    mode: HeaderMode,
) -> io::Result<()> {
    use ::std::os::unix::fs::{FileTypeExt, MetadataExt};

    let file_type = stat.file_type();
    let entry_type;
    if file_type.is_socket() {
        // sockets can't be archived
        return Err(other(&format!(
            "{}: socket can not be archived",
            path.display()
        )));
    } else if file_type.is_fifo() {
        entry_type = EntryType::Fifo;
    } else if file_type.is_char_device() {
        entry_type = EntryType::Char;
    } else if file_type.is_block_device() {
        entry_type = EntryType::Block;
    } else {
        return Err(other(&format!("{} has unknown file type", path.display())));
    }

    let mut header = Header::new_gnu();
    header.set_metadata_in_mode(stat, mode);
    // May emit a GNU long-name ('L') extension entry before the header.
    prepare_header_path(dst, &mut header, path)?;

    header.set_entry_type(entry_type);
    // Split st_rdev into major/minor numbers. NOTE(review): this bit layout
    // appears to follow the Linux/glibc 64-bit dev_t encoding
    // (gnu_dev_major/gnu_dev_minor); confirm for other Unix platforms.
    let dev_id = stat.rdev();
    let dev_major = ((dev_id >> 32) & 0xffff_f000) | ((dev_id >> 8) & 0x0000_0fff);
    let dev_minor = ((dev_id >> 12) & 0xffff_ff00) | ((dev_id) & 0x0000_00ff);
    header.set_device_major(dev_major as u32)?;
    header.set_device_minor(dev_minor as u32)?;

    header.set_cksum();
    dst.write_all(header.as_bytes())?;

    Ok(())
}
686
/// Append a regular file's header and contents to `dst` under the archive
/// name `path`, using GNU sparse encoding when `options.sparse` is set and
/// sparse segments are detected.
fn append_file(
    dst: &mut dyn Write,
    path: &Path,
    file: &mut fs::File,
    options: BuilderOptions,
) -> io::Result<()> {
    let stat = file.metadata()?;
    let mut header = Header::new_gnu();

    // May emit a GNU long-name ('L') extension entry before the real header.
    prepare_header_path(dst, &mut header, path)?;
    header.set_metadata_in_mode(&stat, options.mode);
    // When sparse handling is enabled this rewrites the header's entry type
    // and size fields, so it must happen before the checksum is computed.
    let sparse_entries = if options.sparse {
        prepare_header_sparse(file, &stat, &mut header)?
    } else {
        None
    };
    header.set_cksum();
    dst.write_all(header.as_bytes())?;

    if let Some(sparse_entries) = sparse_entries {
        // Extended sparse headers follow the main header, then only the
        // populated segments of the file are copied.
        append_extended_sparse_headers(dst, &sparse_entries)?;
        for entry in sparse_entries.entries {
            file.seek(io::SeekFrom::Start(entry.offset))?;
            io::copy(&mut file.take(entry.num_bytes), dst)?;
        }
        // Pad relative to the amount of data actually stored on disk.
        pad_zeroes(dst, sparse_entries.on_disk_size)?;
    } else {
        let len = io::copy(file, dst)?;
        pad_zeroes(dst, len)?;
    }

    Ok(())
}
720
721fn append_dir(
722    dst: &mut dyn Write,
723    path: &Path,
724    src_path: &Path,
725    options: BuilderOptions,
726) -> io::Result<()> {
727    let stat = fs::metadata(src_path)?;
728    append_fs(dst, path, &stat, options.mode, None)
729}
730
731fn prepare_header(size: u64, entry_type: u8) -> Header {
732    let mut header = Header::new_gnu();
733    let name = b"././@LongLink";
734    header.as_gnu_mut().unwrap().name[..name.len()].clone_from_slice(&name[..]);
735    header.set_mode(0o644);
736    header.set_uid(0);
737    header.set_gid(0);
738    header.set_mtime(0);
739    // + 1 to be compliant with GNU tar
740    header.set_size(size + 1);
741    header.set_entry_type(EntryType::new(entry_type));
742    header.set_cksum();
743    header
744}
745
/// Store `path` in `header`, emitting a GNU long-name ('L') extension entry
/// to `dst` first when the path does not fit in the header's name field.
fn prepare_header_path(dst: &mut dyn Write, header: &mut Header, path: &Path) -> io::Result<()> {
    // Try to encode the path directly in the header, but if it ends up not
    // working (probably because it's too long) then try to use the GNU-specific
    // long name extension by emitting an entry which indicates that it's the
    // filename.
    if let Err(e) = header.set_path(path) {
        let data = path2bytes(path)?;
        let max = header.as_old().name.len();
        // Since `e` isn't specific enough to let us know the path is indeed too
        // long, verify it first before using the extension.
        if data.len() < max {
            return Err(e);
        }
        // Truncate the path to store in the header we're about to emit to
        // ensure we've got something at least mentioned. Note that we use
        // `str`-encoding to be compatible with Windows, but in general the
        // entry in the header itself shouldn't matter too much since extraction
        // doesn't look at it.
        //
        // Validate the truncated path BEFORE writing the long-name extension
        // to the stream. If validation fails after writing, the orphaned
        // extension entry corrupts subsequent archive entries.
        let truncated = match str::from_utf8(&data[..max]) {
            Ok(s) => s,
            // Back off to the last UTF-8 boundary if the cut lands mid-char.
            Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(),
        };
        header.set_truncated_path_for_gnu_header(truncated)?;

        let header2 = prepare_header(data.len() as u64, b'L');
        // null-terminated string
        let mut data2 = data.chain(io::repeat(0).take(1));
        append(dst, &header2, &mut data2)?;
    }
    Ok(())
}
781
782fn prepare_header_link(
783    dst: &mut dyn Write,
784    header: &mut Header,
785    link_name: &Path,
786) -> io::Result<()> {
787    // Same as previous function but for linkname
788    if let Err(e) = header.set_link_name(link_name) {
789        let data = path2bytes(link_name)?;
790        if data.len() < header.as_old().linkname.len() {
791            return Err(e);
792        }
793        let header2 = prepare_header(data.len() as u64, b'K');
794        let mut data2 = data.chain(io::repeat(0).take(1));
795        append(dst, &header2, &mut data2)?;
796    }
797    Ok(())
798}
799
800fn prepare_header_sparse(
801    file: &mut fs::File,
802    stat: &fs::Metadata,
803    header: &mut Header,
804) -> io::Result<Option<SparseEntries>> {
805    let entries = match find_sparse_entries(file, stat)? {
806        Some(entries) => entries,
807        _ => return Ok(None),
808    };
809
810    header.set_entry_type(EntryType::GNUSparse);
811    header.set_size(entries.on_disk_size);
812
813    // Write the first 4 (GNU_SPARSE_HEADERS_COUNT) entries to the given header.
814    // The remaining entries will be written as subsequent extended headers. See
815    // https://www.gnu.org/software/tar/manual/html_section/Sparse-Formats.html#Old-GNU-Format
816    // for details on the format.
817    let gnu_header = &mut header.as_gnu_mut().unwrap();
818    gnu_header.set_real_size(entries.size());
819
820    for (entry, header_entry) in std::iter::zip(&entries.entries, &mut gnu_header.sparse) {
821        header_entry.set_offset(entry.offset);
822        header_entry.set_length(entry.num_bytes);
823    }
824    gnu_header.set_is_extended(entries.entries.len() > gnu_header.sparse.len());
825
826    Ok(Some(entries))
827}
828
829/// Write extra sparse headers into `dst` for those entries that did not fit in the main header.
830fn append_extended_sparse_headers(dst: &mut dyn Write, entries: &SparseEntries) -> io::Result<()> {
831    // The first `GNU_SPARSE_HEADERS_COUNT` entries are written to the main header, so skip them.
832    let mut it = entries
833        .entries
834        .iter()
835        .skip(GNU_SPARSE_HEADERS_COUNT)
836        .peekable();
837
838    // Each GnuExtSparseHeader can hold up to fixed number of sparse entries (21).
839    // So we pack entries into multiple headers if necessary.
840    while it.peek().is_some() {
841        let mut ext_header = GnuExtSparseHeader::new();
842        for header_entry in ext_header.sparse.iter_mut() {
843            if let Some(entry) = it.next() {
844                header_entry.set_offset(entry.offset);
845                header_entry.set_length(entry.num_bytes);
846            } else {
847                break;
848            }
849        }
850        ext_header.set_is_extended(it.peek().is_some());
851        dst.write_all(ext_header.as_bytes())?;
852    }
853
854    Ok(())
855}
856
857fn append_fs(
858    dst: &mut dyn Write,
859    path: &Path,
860    meta: &fs::Metadata,
861    mode: HeaderMode,
862    link_name: Option<&Path>,
863) -> io::Result<()> {
864    let mut header = Header::new_gnu();
865
866    prepare_header_path(dst, &mut header, path)?;
867    header.set_metadata_in_mode(meta, mode);
868    if let Some(link_name) = link_name {
869        prepare_header_link(dst, &mut header, link_name)?;
870    }
871    header.set_cksum();
872    dst.write_all(header.as_bytes())
873}
874
875fn append_dir_all(
876    dst: &mut dyn Write,
877    path: &Path,
878    src_path: &Path,
879    options: BuilderOptions,
880) -> io::Result<()> {
881    let mut stack = vec![(src_path.to_path_buf(), true, false)];
882    while let Some((src, is_dir, is_symlink)) = stack.pop() {
883        let dest = path.join(src.strip_prefix(src_path).unwrap());
884        // In case of a symlink pointing to a directory, is_dir is false, but src.is_dir() will return true
885        if is_dir || (is_symlink && options.follow && src.is_dir()) {
886            for entry in fs::read_dir(&src)? {
887                let entry = entry?;
888                let file_type = entry.file_type()?;
889                stack.push((entry.path(), file_type.is_dir(), file_type.is_symlink()));
890            }
891            if dest != Path::new("") {
892                append_dir(dst, &dest, &src, options)?;
893            }
894        } else if !options.follow && is_symlink {
895            let stat = fs::symlink_metadata(&src)?;
896            let link_name = fs::read_link(&src)?;
897            append_fs(dst, &dest, &stat, options.mode, Some(&link_name))?;
898        } else {
899            #[cfg(unix)]
900            {
901                let stat = fs::metadata(&src)?;
902                if !stat.is_file() {
903                    append_special(dst, &dest, &stat, options.mode)?;
904                    continue;
905                }
906            }
907            append_file(dst, &dest, &mut fs::File::open(src)?, options)?;
908        }
909    }
910    Ok(())
911}
912
/// Sparse map of a single file: its data extents plus the number of bytes
/// actually allocated on disk.
#[derive(Debug, Clone, PartialEq, Eq)]
struct SparseEntries {
    // Data extents ordered by offset; the list conventionally ends with a
    // zero-length entry at the logical end of the file (see
    // `find_sparse_entries_seek`).
    entries: Vec<SparseEntry>,
    // Total bytes of real data, i.e. the sum of `num_bytes` over `entries`.
    on_disk_size: u64,
}
918
919impl SparseEntries {
920    fn size(&self) -> u64 {
921        self.entries.last().map_or(0, |e| e.offset + e.num_bytes)
922    }
923}
924
/// A single contiguous data extent within a sparse file.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
struct SparseEntry {
    // Byte offset of the extent from the start of the file.
    offset: u64,
    // Length of the extent in bytes; 0 marks the trailing sentinel entry.
    num_bytes: u64,
}
930
/// Find sparse entries in a file. Returns:
/// * `Ok(Some(_))` if the file is sparse.
/// * `Ok(None)` if the file is not sparse, or if the file system does not support sparse files.
/// * `Err(_)` if an error occurred. The lack of support for sparse files is not
///   considered an error. It might return an error if the file is modified
///   while reading.
fn find_sparse_entries(
    file: &mut fs::File,
    stat: &fs::Metadata,
) -> io::Result<Option<SparseEntries>> {
    // Hole detection is only implemented via SEEK_HOLE/SEEK_DATA on the
    // platforms below; everywhere else every file is treated as dense.
    #[cfg(not(any(target_os = "android", target_os = "freebsd", target_os = "linux")))]
    {
        // Silence unused-parameter warnings on unsupported platforms.
        let _ = file;
        let _ = stat;
        Ok(None)
    }

    #[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))]
    find_sparse_entries_seek(file, stat)
}
951
/// Implementation of `find_sparse_entries` using `SEEK_HOLE` and `SEEK_DATA`.
///
/// Alternates `SEEK_DATA`/`SEEK_HOLE` from offset 0 to enumerate data
/// extents. On success the file cursor is rewound to the start; several
/// mid-scan inconsistencies (which indicate the file changed while being
/// read) are reported as errors.
#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))]
fn find_sparse_entries_seek(
    file: &mut fs::File,
    stat: &fs::Metadata,
) -> io::Result<Option<SparseEntries>> {
    use std::os::unix::fs::MetadataExt as _;
    use std::os::unix::io::AsRawFd as _;

    // Thin wrapper over lseek(2) that surfaces failure as a raw errno value,
    // so callers can match on specific codes such as ENXIO.
    fn lseek(file: &fs::File, offset: i64, whence: libc::c_int) -> Result<i64, i32> {
        // lseek64 keeps 64-bit offsets available on 32-bit Linux/Android.
        #[cfg(any(target_os = "linux", target_os = "android"))]
        let lseek = libc::lseek64;
        #[cfg(not(any(target_os = "linux", target_os = "android")))]
        let lseek = libc::lseek;

        match unsafe { lseek(file.as_raw_fd(), offset, whence) } {
            -1 => Err(io::Error::last_os_error().raw_os_error().unwrap()),
            off => Ok(off),
        }
    }

    // Fast path: no blocks allocated at all.
    if stat.blocks() == 0 {
        return Ok(if stat.size() == 0 {
            // Empty file.
            None
        } else {
            // Fully sparse file.
            Some(SparseEntries {
                entries: vec![SparseEntry {
                    offset: stat.size(),
                    num_bytes: 0,
                }],
                on_disk_size: 0,
            })
        });
    }

    // On most Unixes, we need to read `_PC_MIN_HOLE_SIZE` to see if the file
    // system supports `SEEK_HOLE`.
    // FreeBSD: https://man.freebsd.org/cgi/man.cgi?query=lseek&sektion=2&manpath=FreeBSD+14.1-STABLE
    #[cfg(not(any(target_os = "linux", target_os = "android")))]
    if unsafe { libc::fpathconf(file.as_raw_fd(), libc::_PC_MIN_HOLE_SIZE) } == -1 {
        return Ok(None);
    }

    // Linux is the only UNIX-like without support for `_PC_MIN_HOLE_SIZE`, so
    // instead we try to call `lseek` and see if it fails.
    #[cfg(any(target_os = "linux", target_os = "android"))]
    match lseek(file, 0, libc::SEEK_HOLE) {
        Ok(_) => (),
        Err(libc::ENXIO) => {
            // SEEK_HOLE at offset 0 returned ENXIO, i.e. offset 0 is already
            // past the end: the file is empty. Treat it as non-sparse.
            return Ok(None);
        }
        Err(_) => return Ok(None),
    }

    // Scan the file: each iteration finds the next data extent starting at
    // or after `off_s`, then the hole (or EOF) that terminates it.
    let mut entries = Vec::new();
    let mut on_disk_size = 0;
    let mut off_s = 0;
    loop {
        //  off_s=0      │     off_s               │ off_s
        //    ↓          │       ↓                 │   ↓
        //    | DATA |…  │  ……………| HOLE | DATA |…  │  …|×EOF×
        //    ↑          │       ↑      ↑          │
        //   (a)         │  (b) (c)    (d)         │     (e)
        match lseek(file, off_s, libc::SEEK_DATA) {
            Ok(0) if off_s == 0 => (), // (a) The file starts with data.
            Ok(off) if off < off_s => {
                // (b) Unlikely.
                return Err(std::io::Error::new(
                    io::ErrorKind::Other,
                    "lseek(SEEK_DATA) went backwards",
                ));
            }
            Ok(off) if off == off_s => {
                // (c) The data at the same offset as the hole.
                return Err(std::io::Error::new(
                    io::ErrorKind::Other,
                    "lseek(SEEK_DATA) did not advance. \
                     Did the file change while appending?",
                ));
            }
            Ok(off) => off_s = off,    // (d) Jump to the next hole.
            Err(libc::ENXIO) => break, // (e) Reached the end of the file.
            Err(errno) => return Err(io::Error::from_raw_os_error(errno)),
        };

        // off_s=0          │     off_s               │    off_s
        //   ↓              │       ↓                 │      ↓
        //   | DATA |×EOF×  │  ……………| DATA | HOLE |…  │  …|×EOF×
        //          ↑       │       ↑      ↑          │
        //         (a)      │  (b) (c)    (d)         │     (e)
        match lseek(file, off_s, libc::SEEK_HOLE) {
            Ok(off_e) if off_s == 0 && (off_e as u64) == stat.size() => {
                // (a) The file is not sparse.
                file.seek(io::SeekFrom::Start(0))?;
                return Ok(None);
            }
            Ok(off_e) if off_e < off_s => {
                // (b) Unlikely.
                return Err(std::io::Error::new(
                    io::ErrorKind::Other,
                    "lseek(SEEK_HOLE) went backwards",
                ));
            }
            Ok(off_e) if off_e == off_s => {
                // (c) The hole at the same offset as the data.
                return Err(std::io::Error::new(
                    io::ErrorKind::Other,
                    "lseek(SEEK_HOLE) did not advance. \
                     Did the file change while appending?",
                ));
            }
            Ok(off_e) => {
                // (d) Found a hole or reached the end of the file (implicit
                // zero-length hole).
                entries.push(SparseEntry {
                    offset: off_s as u64,
                    num_bytes: off_e as u64 - off_s as u64,
                });
                on_disk_size += off_e as u64 - off_s as u64;
                off_s = off_e;
            }
            Err(libc::ENXIO) => {
                // (e) off_s was already beyond the end of the file.
                return Err(std::io::Error::new(
                    io::ErrorKind::Other,
                    "lseek(SEEK_HOLE) returned ENXIO. \
                     Did the file change while appending?",
                ));
            }
            Err(errno) => return Err(io::Error::from_raw_os_error(errno)),
        };
    }

    // Sanity check against concurrent modification: the scan must not have
    // walked past the size recorded in `stat`.
    if off_s as u64 > stat.size() {
        return Err(std::io::Error::new(
            io::ErrorKind::Other,
            "lseek(SEEK_DATA) went beyond the end of the file. \
             Did the file change while appending?",
        ));
    }

    // Add a final zero-length entry. It is required if the file ends with a
    // hole, and redundant otherwise. However, we add it unconditionally to
    // mimic GNU tar behavior.
    entries.push(SparseEntry {
        offset: stat.size(),
        num_bytes: 0,
    });

    // Rewind so the caller can read the data extents from the start.
    file.seek(io::SeekFrom::Start(0))?;

    Ok(Some(SparseEntries {
        entries,
        on_disk_size,
    }))
}
1111
impl<W: Write> Drop for Builder<W> {
    fn drop(&mut self) {
        // Best-effort finalization: drop cannot report failures, so any error
        // from `finish()` is discarded. Call `finish()` explicitly to observe
        // I/O errors.
        let _ = self.finish();
    }
}
1117
#[cfg(test)]
mod tests {
    use super::*;

    /// Should be multiple of 4KiB on ext4, multiple of 32KiB on FreeBSD/UFS, multiple of 64KiB on
    /// ppc64el
    const SPARSE_BLOCK_SIZE: u64 = 64 * 1024;

    /// Exercises `find_sparse_entries` against files with various hole/data
    /// layouts, written to a real temporary file. Each case string is a
    /// picture of the layout in `SPARSE_BLOCK_SIZE` units.
    #[test]
    fn test_find_sparse_entries() {
        let cases: &[(&str, &[SparseEntry])] = &[
            ("|", &[]),
            (
                "|    |    |    |    |",
                &[SparseEntry {
                    offset: 4 * SPARSE_BLOCK_SIZE,
                    num_bytes: 0,
                }],
            ),
            (
                "|####|####|####|####|",
                &[
                    SparseEntry {
                        offset: 0,
                        num_bytes: 4 * SPARSE_BLOCK_SIZE,
                    },
                    SparseEntry {
                        offset: 4 * SPARSE_BLOCK_SIZE,
                        num_bytes: 0,
                    },
                ],
            ),
            (
                "|####|####|    |    |",
                &[
                    SparseEntry {
                        offset: 0,
                        num_bytes: 2 * SPARSE_BLOCK_SIZE,
                    },
                    SparseEntry {
                        offset: 4 * SPARSE_BLOCK_SIZE,
                        num_bytes: 0,
                    },
                ],
            ),
            (
                "|    |    |####|####|",
                &[
                    SparseEntry {
                        offset: 2 * SPARSE_BLOCK_SIZE,
                        num_bytes: 2 * SPARSE_BLOCK_SIZE,
                    },
                    SparseEntry {
                        offset: 4 * SPARSE_BLOCK_SIZE,
                        num_bytes: 0,
                    },
                ],
            ),
            (
                "|####|    |####|    |",
                &[
                    SparseEntry {
                        offset: 0,
                        num_bytes: SPARSE_BLOCK_SIZE,
                    },
                    SparseEntry {
                        offset: 2 * SPARSE_BLOCK_SIZE,
                        num_bytes: SPARSE_BLOCK_SIZE,
                    },
                    SparseEntry {
                        offset: 4 * SPARSE_BLOCK_SIZE,
                        num_bytes: 0,
                    },
                ],
            ),
            (
                "|####|    |    |####|",
                &[
                    SparseEntry {
                        offset: 0,
                        num_bytes: SPARSE_BLOCK_SIZE,
                    },
                    SparseEntry {
                        offset: 3 * SPARSE_BLOCK_SIZE,
                        num_bytes: SPARSE_BLOCK_SIZE,
                    },
                    SparseEntry {
                        offset: 4 * SPARSE_BLOCK_SIZE,
                        num_bytes: 0,
                    },
                ],
            ),
            (
                "|    |####|####|    |",
                &[
                    SparseEntry {
                        offset: SPARSE_BLOCK_SIZE,
                        num_bytes: 2 * SPARSE_BLOCK_SIZE,
                    },
                    SparseEntry {
                        offset: 4 * SPARSE_BLOCK_SIZE,
                        num_bytes: 0,
                    },
                ],
            ),
        ];

        let mut file = tempfile::tempfile().unwrap();

        for &(description, map) in cases {
            // Reset the file, then size it to the logical end of the layout
            // (the trailing sentinel entry's offset).
            file.set_len(0).unwrap();
            file.set_len(map.last().map_or(0, |e| e.offset + e.num_bytes))
                .unwrap();

            // Materialize each data extent with non-zero bytes so the file
            // system allocates blocks for it.
            for e in map {
                file.seek(io::SeekFrom::Start(e.offset)).unwrap();
                for _ in 0..e.num_bytes / SPARSE_BLOCK_SIZE {
                    file.write_all(&[0xFF; SPARSE_BLOCK_SIZE as usize]).unwrap();
                }
            }

            let expected = match map {
                // Empty file.
                &[] => None,

                // 100% dense.
                &[SparseEntry {
                    offset: 0,
                    num_bytes: x1,
                }, SparseEntry {
                    offset: x2,
                    num_bytes: 0,
                }] if x1 == x2 => None,

                // Sparse.
                map => Some(SparseEntries {
                    entries: map.to_vec(),
                    on_disk_size: map.iter().map(|e| e.num_bytes).sum(),
                }),
            };

            let stat = file.metadata().unwrap();
            let reported = find_sparse_entries(&mut file, &stat).unwrap();

            // Loose check: we did not miss any data blocks.
            if let Err(e) = loose_check_sparse_entries(reported.as_ref(), expected.as_ref()) {
                panic!(
                    "Case: {description}\n\
                     Reported: {reported:?}\n\
                     Expected: {expected:?}\n\
                     Error: {e}",
                );
            }

            // On Linux, always do a strict check. Skip on FreeBSD, as on UFS
            // the last block is always dense, even if it's zero-filled.
            #[cfg(any(target_os = "android", target_os = "linux"))]
            assert_eq!(reported, expected, "Case: {description}");
        }
    }

    /// Validates `reported` against `expected` without requiring an exact
    /// match: the reported map may conservatively classify holes as data, but
    /// it must cover every expected data extent and satisfy the structural
    /// invariants of `SparseEntries`.
    fn loose_check_sparse_entries(
        reported: Option<&SparseEntries>,
        expected: Option<&SparseEntries>,
    ) -> Result<(), &'static str> {
        let reported = match reported {
            Some(entries) => entries, // Reported as sparse.
            // It's not an error to report a sparse file as non-sparse.
            None => return Ok(()),
        };
        let expected = match expected {
            Some(entries) => entries,
            None => return Err("Expected dense file, but reported as sparse"),
        };

        // Check that we didn't miss any data blocks. However, reporting some
        // holes as data is not an error during the loose check.
        if expected.entries.iter().any(|e| {
            !reported
                .entries
                .iter()
                .any(|r| e.offset >= r.offset && e.offset + e.num_bytes <= r.offset + r.num_bytes)
        }) {
            return Err("Reported is not a superset of expected");
        }

        if reported.entries.last() != expected.entries.last() {
            return Err("Last zero-length entry is not as expected");
        }

        // Check invariants of SparseEntries: entries sorted and
        // non-overlapping. (The previous `[..len()]` full-range slice here
        // was a no-op; iterate the Vec directly.)
        let mut prev_end = None;
        for e in &reported.entries {
            if prev_end.map_or(false, |p| e.offset < p) {
                return Err("Overlapping or unsorted entries");
            }
            prev_end = Some(e.offset + e.num_bytes);
        }

        if reported.on_disk_size != reported.entries.iter().map(|e| e.num_bytes).sum() {
            return Err("Incorrect on-disk size");
        }

        Ok(())
    }
}