tar/
builder.rs

1use std::fs;
2use std::io;
3use std::io::prelude::*;
4use std::path::Path;
5use std::str;
6
7use crate::header::BLOCK_SIZE;
8use crate::header::GNU_SPARSE_HEADERS_COUNT;
9use crate::header::{path2bytes, HeaderMode};
10use crate::GnuExtSparseHeader;
11use crate::{other, EntryType, Header};
12
13/// A structure for building archives
14///
15/// This structure has methods for building up an archive from scratch into any
16/// arbitrary writer.
17pub struct Builder<W: Write> {
18    options: BuilderOptions,
19    finished: bool,
20    obj: Option<W>,
21}
22
23#[derive(Clone, Copy)]
24struct BuilderOptions {
25    mode: HeaderMode,
26    follow: bool,
27    sparse: bool,
28}
29
30impl<W: Write> Builder<W> {
31    /// Create a new archive builder with the underlying object as the
32    /// destination of all data written. The builder will use
33    /// `HeaderMode::Complete` by default.
34    pub fn new(obj: W) -> Builder<W> {
35        Builder {
36            options: BuilderOptions {
37                mode: HeaderMode::Complete,
38                follow: true,
39                sparse: true,
40            },
41            finished: false,
42            obj: Some(obj),
43        }
44    }
45
46    /// Changes the HeaderMode that will be used when reading fs Metadata for
47    /// methods that implicitly read metadata for an input Path. Notably, this
48    /// does _not_ apply to `append(Header)`.
49    pub fn mode(&mut self, mode: HeaderMode) {
50        self.options.mode = mode;
51    }
52
53    /// Follow symlinks, archiving the contents of the file they point to rather
54    /// than adding a symlink to the archive. Defaults to true.
55    ///
56    /// When true, it exhibits the same behavior as GNU `tar` command's
57    /// `--dereference` or `-h` options <https://man7.org/linux/man-pages/man1/tar.1.html>.
58    pub fn follow_symlinks(&mut self, follow: bool) {
59        self.options.follow = follow;
60    }
61
62    /// Handle sparse files efficiently, if supported by the underlying
63    /// filesystem. When true, sparse file information is read from disk and
64    /// empty segments are omitted from the archive. Defaults to true.
65    pub fn sparse(&mut self, sparse: bool) {
66        self.options.sparse = sparse;
67    }
68
69    /// Gets shared reference to the underlying object.
70    pub fn get_ref(&self) -> &W {
71        self.obj.as_ref().unwrap()
72    }
73
74    /// Gets mutable reference to the underlying object.
75    ///
76    /// Note that care must be taken while writing to the underlying
77    /// object. But, e.g. `get_mut().flush()` is claimed to be safe and
78    /// useful in the situations when one needs to be ensured that
79    /// tar entry was flushed to the disk.
80    pub fn get_mut(&mut self) -> &mut W {
81        self.obj.as_mut().unwrap()
82    }
83
84    /// Unwrap this archive, returning the underlying object.
85    ///
86    /// This function will finish writing the archive if the `finish` function
87    /// hasn't yet been called, returning any I/O error which happens during
88    /// that operation.
89    pub fn into_inner(mut self) -> io::Result<W> {
90        if !self.finished {
91            self.finish()?;
92        }
93        Ok(self.obj.take().unwrap())
94    }
95
96    /// Adds a new entry to this archive.
97    ///
98    /// This function will append the header specified, followed by contents of
99    /// the stream specified by `data`. To produce a valid archive the `size`
100    /// field of `header` must be the same as the length of the stream that's
101    /// being written. Additionally the checksum for the header should have been
102    /// set via the `set_cksum` method.
103    ///
104    /// Note that this will not attempt to seek the archive to a valid position,
105    /// so if the archive is in the middle of a read or some other similar
106    /// operation then this may corrupt the archive.
107    ///
108    /// Also note that after all entries have been written to an archive the
109    /// `finish` function needs to be called to finish writing the archive.
110    ///
111    /// # Errors
112    ///
113    /// This function will return an error for any intermittent I/O error which
114    /// occurs when either reading or writing.
115    ///
116    /// # Examples
117    ///
118    /// ```
119    /// use tar::{Builder, Header};
120    ///
121    /// let mut header = Header::new_gnu();
122    /// header.set_path("foo").unwrap();
123    /// header.set_size(4);
124    /// header.set_cksum();
125    ///
126    /// let mut data: &[u8] = &[1, 2, 3, 4];
127    ///
128    /// let mut ar = Builder::new(Vec::new());
129    /// ar.append(&header, data).unwrap();
130    /// let data = ar.into_inner().unwrap();
131    /// ```
132    pub fn append<R: Read>(&mut self, header: &Header, mut data: R) -> io::Result<()> {
133        append(self.get_mut(), header, &mut data)
134    }
135
136    /// Adds a new entry to this archive with the specified path.
137    ///
138    /// This function will set the specified path in the given header, which may
139    /// require appending a GNU long-name extension entry to the archive first.
140    /// The checksum for the header will be automatically updated via the
141    /// `set_cksum` method after setting the path. No other metadata in the
142    /// header will be modified.
143    ///
144    /// Then it will append the header, followed by contents of the stream
145    /// specified by `data`. To produce a valid archive the `size` field of
146    /// `header` must be the same as the length of the stream that's being
147    /// written.
148    ///
149    /// Note that this will not attempt to seek the archive to a valid position,
150    /// so if the archive is in the middle of a read or some other similar
151    /// operation then this may corrupt the archive.
152    ///
153    /// Also note that after all entries have been written to an archive the
154    /// `finish` function needs to be called to finish writing the archive.
155    ///
156    /// # Errors
157    ///
158    /// This function will return an error for any intermittent I/O error which
159    /// occurs when either reading or writing.
160    ///
161    /// # Examples
162    ///
163    /// ```
164    /// use tar::{Builder, Header};
165    ///
166    /// let mut header = Header::new_gnu();
167    /// header.set_size(4);
168    /// header.set_cksum();
169    ///
170    /// let mut data: &[u8] = &[1, 2, 3, 4];
171    ///
172    /// let mut ar = Builder::new(Vec::new());
173    /// ar.append_data(&mut header, "really/long/path/to/foo", data).unwrap();
174    /// let data = ar.into_inner().unwrap();
175    /// ```
176    pub fn append_data<P: AsRef<Path>, R: Read>(
177        &mut self,
178        header: &mut Header,
179        path: P,
180        data: R,
181    ) -> io::Result<()> {
182        prepare_header_path(self.get_mut(), header, path.as_ref())?;
183        header.set_cksum();
184        self.append(&header, data)
185    }
186
187    /// Adds a new entry to this archive and returns an [`EntryWriter`] for
188    /// adding its contents.
189    ///
190    /// This function is similar to [`Self::append_data`] but returns a
191    /// [`io::Write`] implementation instead of taking data as a parameter.
192    ///
193    /// Similar constraints around the position of the archive and completion
194    /// apply as with [`Self::append_data`]. It requires the underlying writer
195    /// to implement [`Seek`] to update the header after writing the data.
196    ///
197    /// # Errors
198    ///
199    /// This function will return an error for any intermittent I/O error which
200    /// occurs when either reading or writing.
201    ///
202    /// # Examples
203    ///
204    /// ```
205    /// use std::io::Cursor;
206    /// use std::io::Write as _;
207    /// use tar::{Builder, Header};
208    ///
209    /// let mut header = Header::new_gnu();
210    ///
211    /// let mut ar = Builder::new(Cursor::new(Vec::new()));
212    /// let mut entry = ar.append_writer(&mut header, "hi.txt").unwrap();
213    /// entry.write_all(b"Hello, ").unwrap();
214    /// entry.write_all(b"world!\n").unwrap();
215    /// entry.finish().unwrap();
216    /// ```
217    pub fn append_writer<'a, P: AsRef<Path>>(
218        &'a mut self,
219        header: &'a mut Header,
220        path: P,
221    ) -> io::Result<EntryWriter<'a>>
222    where
223        W: Seek,
224    {
225        EntryWriter::start(self.get_mut(), header, path.as_ref())
226    }
227
228    /// Adds a new link (symbolic or hard) entry to this archive with the specified path and target.
229    ///
230    /// This function is similar to [`Self::append_data`] which supports long filenames,
231    /// but also supports long link targets using GNU extensions if necessary.
232    /// You must set the entry type to either [`EntryType::Link`] or [`EntryType::Symlink`].
233    /// The `set_cksum` method will be invoked after setting the path. No other metadata in the
234    /// header will be modified.
235    ///
236    /// If you are intending to use GNU extensions, you must use this method over calling
237    /// [`Header::set_link_name`] because that function will fail on long links.
238    ///
239    /// Similar constraints around the position of the archive and completion
240    /// apply as with [`Self::append_data`].
241    ///
242    /// # Errors
243    ///
244    /// This function will return an error for any intermittent I/O error which
245    /// occurs when either reading or writing.
246    ///
247    /// # Examples
248    ///
249    /// ```
250    /// use tar::{Builder, Header, EntryType};
251    ///
252    /// let mut ar = Builder::new(Vec::new());
253    /// let mut header = Header::new_gnu();
254    /// header.set_username("foo");
255    /// header.set_entry_type(EntryType::Symlink);
256    /// header.set_size(0);
257    /// ar.append_link(&mut header, "really/long/path/to/foo", "other/really/long/target").unwrap();
258    /// let data = ar.into_inner().unwrap();
259    /// ```
260    pub fn append_link<P: AsRef<Path>, T: AsRef<Path>>(
261        &mut self,
262        header: &mut Header,
263        path: P,
264        target: T,
265    ) -> io::Result<()> {
266        self._append_link(header, path.as_ref(), target.as_ref())
267    }
268
269    fn _append_link(&mut self, header: &mut Header, path: &Path, target: &Path) -> io::Result<()> {
270        prepare_header_path(self.get_mut(), header, path)?;
271        prepare_header_link(self.get_mut(), header, target)?;
272        header.set_cksum();
273        self.append(&header, std::io::empty())
274    }
275
276    /// Adds a file on the local filesystem to this archive.
277    ///
278    /// This function will open the file specified by `path` and insert the file
279    /// into the archive with the appropriate metadata set, returning any I/O
280    /// error which occurs while writing. The path name for the file inside of
281    /// this archive will be the same as `path`, and it is required that the
282    /// path is a relative path.
283    ///
284    /// Note that this will not attempt to seek the archive to a valid position,
285    /// so if the archive is in the middle of a read or some other similar
286    /// operation then this may corrupt the archive.
287    ///
288    /// Also note that after all files have been written to an archive the
289    /// `finish` function needs to be called to finish writing the archive.
290    ///
291    /// # Examples
292    ///
293    /// ```no_run
294    /// use tar::Builder;
295    ///
296    /// let mut ar = Builder::new(Vec::new());
297    ///
298    /// ar.append_path("foo/bar.txt").unwrap();
299    /// ```
300    pub fn append_path<P: AsRef<Path>>(&mut self, path: P) -> io::Result<()> {
301        let options = self.options;
302        append_path_with_name(self.get_mut(), path.as_ref(), None, options)
303    }
304
305    /// Adds a file on the local filesystem to this archive under another name.
306    ///
307    /// This function will open the file specified by `path` and insert the file
308    /// into the archive as `name` with appropriate metadata set, returning any
309    /// I/O error which occurs while writing. The path name for the file inside
    /// of this archive will be `name`, which is required to be a relative path.
311    ///
312    /// Note that this will not attempt to seek the archive to a valid position,
313    /// so if the archive is in the middle of a read or some other similar
314    /// operation then this may corrupt the archive.
315    ///
    /// Note that if `path` is a directory, this will just add an entry for the
    /// directory to the archive, rather than the contents of the directory.
318    ///
319    /// Also note that after all files have been written to an archive the
320    /// `finish` function needs to be called to finish writing the archive.
321    ///
322    /// # Examples
323    ///
324    /// ```no_run
325    /// use tar::Builder;
326    ///
327    /// let mut ar = Builder::new(Vec::new());
328    ///
329    /// // Insert the local file "foo/bar.txt" in the archive but with the name
330    /// // "bar/foo.txt".
331    /// ar.append_path_with_name("foo/bar.txt", "bar/foo.txt").unwrap();
332    /// ```
333    pub fn append_path_with_name<P: AsRef<Path>, N: AsRef<Path>>(
334        &mut self,
335        path: P,
336        name: N,
337    ) -> io::Result<()> {
338        let options = self.options;
339        append_path_with_name(self.get_mut(), path.as_ref(), Some(name.as_ref()), options)
340    }
341
342    /// Adds a file to this archive with the given path as the name of the file
343    /// in the archive.
344    ///
345    /// This will use the metadata of `file` to populate a `Header`, and it will
346    /// then append the file to the archive with the name `path`.
347    ///
348    /// Note that this will not attempt to seek the archive to a valid position,
349    /// so if the archive is in the middle of a read or some other similar
350    /// operation then this may corrupt the archive.
351    ///
352    /// Also note that after all files have been written to an archive the
353    /// `finish` function needs to be called to finish writing the archive.
354    ///
355    /// # Examples
356    ///
357    /// ```no_run
358    /// use std::fs::File;
359    /// use tar::Builder;
360    ///
361    /// let mut ar = Builder::new(Vec::new());
362    ///
363    /// // Open the file at one location, but insert it into the archive with a
364    /// // different name.
365    /// let mut f = File::open("foo/bar/baz.txt").unwrap();
366    /// ar.append_file("bar/baz.txt", &mut f).unwrap();
367    /// ```
368    pub fn append_file<P: AsRef<Path>>(&mut self, path: P, file: &mut fs::File) -> io::Result<()> {
369        let options = self.options;
370        append_file(self.get_mut(), path.as_ref(), file, options)
371    }
372
373    /// Adds a directory to this archive with the given path as the name of the
374    /// directory in the archive.
375    ///
376    /// This will use `stat` to populate a `Header`, and it will then append the
377    /// directory to the archive with the name `path`.
378    ///
379    /// Note that this will not attempt to seek the archive to a valid position,
380    /// so if the archive is in the middle of a read or some other similar
381    /// operation then this may corrupt the archive.
382    ///
383    /// Note this will not add the contents of the directory to the archive.
    /// See `append_dir_all` for recursively adding the contents of the directory.
385    ///
386    /// Also note that after all files have been written to an archive the
387    /// `finish` function needs to be called to finish writing the archive.
388    ///
389    /// # Examples
390    ///
391    /// ```
392    /// use std::fs;
393    /// use tar::Builder;
394    ///
395    /// let mut ar = Builder::new(Vec::new());
396    ///
397    /// // Use the directory at one location, but insert it into the archive
398    /// // with a different name.
399    /// ar.append_dir("bardir", ".").unwrap();
400    /// ```
401    pub fn append_dir<P, Q>(&mut self, path: P, src_path: Q) -> io::Result<()>
402    where
403        P: AsRef<Path>,
404        Q: AsRef<Path>,
405    {
406        let options = self.options;
407        append_dir(self.get_mut(), path.as_ref(), src_path.as_ref(), options)
408    }
409
410    /// Adds a directory and all of its contents (recursively) to this archive
411    /// with the given path as the name of the directory in the archive.
412    ///
413    /// Note that this will not attempt to seek the archive to a valid position,
414    /// so if the archive is in the middle of a read or some other similar
415    /// operation then this may corrupt the archive.
416    ///
417    /// Also note that after all files have been written to an archive the
418    /// `finish` or `into_inner` function needs to be called to finish
419    /// writing the archive.
420    ///
421    /// # Examples
422    ///
423    /// ```
424    /// use std::fs;
425    /// use tar::Builder;
426    ///
427    /// let mut ar = Builder::new(Vec::new());
428    ///
429    /// // Use the directory at one location ("."), but insert it into the archive
430    /// // with a different name ("bardir").
431    /// ar.append_dir_all("bardir", ".").unwrap();
432    /// ar.finish().unwrap();
433    /// ```
434    ///
435    /// Use `append_dir_all` with an empty string as the first path argument to
436    /// create an archive from all files in a directory without renaming.
437    ///
438    /// ```
439    /// use std::fs;
440    /// use std::path::PathBuf;
441    /// use tar::{Archive, Builder};
442    ///
443    /// let tmpdir = tempfile::tempdir().unwrap();
444    /// let path = tmpdir.path();
445    /// fs::write(path.join("a.txt"), b"hello").unwrap();
446    /// fs::write(path.join("b.txt"), b"world").unwrap();
447    ///
448    /// // Create a tarball from the files in the directory
449    /// let mut ar = Builder::new(Vec::new());
450    /// ar.append_dir_all("", path).unwrap();
451    ///
452    /// // List files in the archive
453    /// let archive = ar.into_inner().unwrap();
454    /// let archived_files = Archive::new(archive.as_slice())
455    ///     .entries()
456    ///     .unwrap()
457    ///     .map(|entry| entry.unwrap().path().unwrap().into_owned())
458    ///     .collect::<Vec<_>>();
459    ///
460    /// assert!(archived_files.contains(&PathBuf::from("a.txt")));
461    /// assert!(archived_files.contains(&PathBuf::from("b.txt")));
462    /// ```
463    pub fn append_dir_all<P, Q>(&mut self, path: P, src_path: Q) -> io::Result<()>
464    where
465        P: AsRef<Path>,
466        Q: AsRef<Path>,
467    {
468        let options = self.options;
469        append_dir_all(self.get_mut(), path.as_ref(), src_path.as_ref(), options)
470    }
471
472    /// Finish writing this archive, emitting the termination sections.
473    ///
474    /// This function should only be called when the archive has been written
475    /// entirely and if an I/O error happens the underlying object still needs
476    /// to be acquired.
477    ///
478    /// In most situations the `into_inner` method should be preferred.
479    pub fn finish(&mut self) -> io::Result<()> {
480        if self.finished {
481            return Ok(());
482        }
483        self.finished = true;
484        self.get_mut().write_all(&[0; 1024])
485    }
486}
487
/// Combination of [`Write`] and [`Seek`] that can be used as a single trait
/// object, with an explicit conversion back to a plain `Write` object.
trait SeekWrite: Write + Seek {
    /// Returns `self` viewed as a `&mut dyn Write`.
    fn as_write(&mut self) -> &mut dyn Write;
}
491
492impl<T: Write + Seek> SeekWrite for T {
493    fn as_write(&mut self) -> &mut dyn Write {
494        self
495    }
496}
497
/// A writer for a single entry in a tar archive.
///
/// This struct is returned by [`Builder::append_writer`] and provides a
/// [`Write`] implementation for adding content to an archive entry.
///
/// After writing all data to the entry, it must be finalized either by
/// explicitly calling [`EntryWriter::finish`] or by letting it drop.
pub struct EntryWriter<'a> {
    // NOTE: Do not add any fields here which require Drop!
    // See the comment below in finish().
    // Destination archive; must be seekable so the header can be rewritten
    // once the entry's size is known.
    obj: &'a mut dyn SeekWrite,
    // Header of this entry; its size and checksum are set when finishing.
    header: &'a mut Header,
    // Number of content bytes written to this entry so far.
    written: u64,
}
512
513impl EntryWriter<'_> {
514    fn start<'a>(
515        obj: &'a mut dyn SeekWrite,
516        header: &'a mut Header,
517        path: &Path,
518    ) -> io::Result<EntryWriter<'a>> {
519        prepare_header_path(obj.as_write(), header, path)?;
520
521        // Reserve space for header, will be overwritten once data is written.
522        obj.write_all([0u8; BLOCK_SIZE as usize].as_ref())?;
523
524        Ok(EntryWriter {
525            obj,
526            header,
527            written: 0,
528        })
529    }
530
531    /// Finish writing the current entry in the archive.
532    pub fn finish(self) -> io::Result<()> {
533        // NOTE: This is an optimization for "fallible destructuring".
534        // We want finish() to return an error, but we also need to invoke
535        // cleanup in our Drop handler, which will run unconditionally
536        // and try to do the same work.
537        // By using ManuallyDrop, we suppress that drop. However, this would
538        // be a memory leak if we ever had any struct members which required
539        // Drop - which we don't right now.
540        // But if we ever gain one, we will need to change to use e.g. Option<>
541        // around some of the fields or have a `bool finished` etc.
542        let mut this = std::mem::ManuallyDrop::new(self);
543        this.do_finish()
544    }
545
546    fn do_finish(&mut self) -> io::Result<()> {
547        // Pad with zeros if necessary.
548        let buf = [0u8; BLOCK_SIZE as usize];
549        let remaining = BLOCK_SIZE.wrapping_sub(self.written) % BLOCK_SIZE;
550        self.obj.write_all(&buf[..remaining as usize])?;
551        let written = (self.written + remaining) as i64;
552
553        // Seek back to the header position.
554        self.obj
555            .seek(io::SeekFrom::Current(-written - BLOCK_SIZE as i64))?;
556
557        self.header.set_size(self.written);
558        self.header.set_cksum();
559        self.obj.write_all(self.header.as_bytes())?;
560
561        // Seek forward to restore the position.
562        self.obj.seek(io::SeekFrom::Current(written))?;
563
564        Ok(())
565    }
566}
567
568impl Write for EntryWriter<'_> {
569    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
570        let len = self.obj.write(buf)?;
571        self.written += len as u64;
572        Ok(len)
573    }
574
575    fn flush(&mut self) -> io::Result<()> {
576        self.obj.flush()
577    }
578}
579
580impl Drop for EntryWriter<'_> {
581    fn drop(&mut self) {
582        let _ = self.do_finish();
583    }
584}
585
586fn append(mut dst: &mut dyn Write, header: &Header, mut data: &mut dyn Read) -> io::Result<()> {
587    dst.write_all(header.as_bytes())?;
588    let len = io::copy(&mut data, &mut dst)?;
589    pad_zeroes(&mut dst, len)?;
590    Ok(())
591}
592
593fn pad_zeroes(dst: &mut dyn Write, len: u64) -> io::Result<()> {
594    let buf = [0; BLOCK_SIZE as usize];
595    let remaining = BLOCK_SIZE - (len % BLOCK_SIZE);
596    if remaining < BLOCK_SIZE {
597        dst.write_all(&buf[..remaining as usize])?;
598    }
599    Ok(())
600}
601
602fn append_path_with_name(
603    dst: &mut dyn Write,
604    path: &Path,
605    name: Option<&Path>,
606    options: BuilderOptions,
607) -> io::Result<()> {
608    let stat = if options.follow {
609        fs::metadata(path).map_err(|err| {
610            io::Error::new(
611                err.kind(),
612                format!("{} when getting metadata for {}", err, path.display()),
613            )
614        })?
615    } else {
616        fs::symlink_metadata(path).map_err(|err| {
617            io::Error::new(
618                err.kind(),
619                format!("{} when getting metadata for {}", err, path.display()),
620            )
621        })?
622    };
623    let ar_name = name.unwrap_or(path);
624    if stat.is_file() {
625        append_file(dst, ar_name, &mut fs::File::open(path)?, options)
626    } else if stat.is_dir() {
627        append_fs(dst, ar_name, &stat, options.mode, None)
628    } else if stat.file_type().is_symlink() {
629        let link_name = fs::read_link(path)?;
630        append_fs(dst, ar_name, &stat, options.mode, Some(&link_name))
631    } else {
632        #[cfg(unix)]
633        {
634            append_special(dst, path, &stat, options.mode)
635        }
636        #[cfg(not(unix))]
637        {
638            Err(other(&format!("{} has unknown file type", path.display())))
639        }
640    }
641}
642
643#[cfg(unix)]
644fn append_special(
645    dst: &mut dyn Write,
646    path: &Path,
647    stat: &fs::Metadata,
648    mode: HeaderMode,
649) -> io::Result<()> {
650    use ::std::os::unix::fs::{FileTypeExt, MetadataExt};
651
652    let file_type = stat.file_type();
653    let entry_type;
654    if file_type.is_socket() {
655        // sockets can't be archived
656        return Err(other(&format!(
657            "{}: socket can not be archived",
658            path.display()
659        )));
660    } else if file_type.is_fifo() {
661        entry_type = EntryType::Fifo;
662    } else if file_type.is_char_device() {
663        entry_type = EntryType::Char;
664    } else if file_type.is_block_device() {
665        entry_type = EntryType::Block;
666    } else {
667        return Err(other(&format!("{} has unknown file type", path.display())));
668    }
669
670    let mut header = Header::new_gnu();
671    header.set_metadata_in_mode(stat, mode);
672    prepare_header_path(dst, &mut header, path)?;
673
674    header.set_entry_type(entry_type);
675    let dev_id = stat.rdev();
676    let dev_major = ((dev_id >> 32) & 0xffff_f000) | ((dev_id >> 8) & 0x0000_0fff);
677    let dev_minor = ((dev_id >> 12) & 0xffff_ff00) | ((dev_id) & 0x0000_00ff);
678    header.set_device_major(dev_major as u32)?;
679    header.set_device_minor(dev_minor as u32)?;
680
681    header.set_cksum();
682    dst.write_all(header.as_bytes())?;
683
684    Ok(())
685}
686
687fn append_file(
688    dst: &mut dyn Write,
689    path: &Path,
690    file: &mut fs::File,
691    options: BuilderOptions,
692) -> io::Result<()> {
693    let stat = file.metadata()?;
694    let mut header = Header::new_gnu();
695
696    prepare_header_path(dst, &mut header, path)?;
697    header.set_metadata_in_mode(&stat, options.mode);
698    let sparse_entries = if options.sparse {
699        prepare_header_sparse(file, &stat, &mut header)?
700    } else {
701        None
702    };
703    header.set_cksum();
704    dst.write_all(header.as_bytes())?;
705
706    if let Some(sparse_entries) = sparse_entries {
707        append_extended_sparse_headers(dst, &sparse_entries)?;
708        for entry in sparse_entries.entries {
709            file.seek(io::SeekFrom::Start(entry.offset))?;
710            io::copy(&mut file.take(entry.num_bytes), dst)?;
711        }
712        pad_zeroes(dst, sparse_entries.on_disk_size)?;
713    } else {
714        let len = io::copy(file, dst)?;
715        pad_zeroes(dst, len)?;
716    }
717
718    Ok(())
719}
720
721fn append_dir(
722    dst: &mut dyn Write,
723    path: &Path,
724    src_path: &Path,
725    options: BuilderOptions,
726) -> io::Result<()> {
727    let stat = fs::metadata(src_path)?;
728    append_fs(dst, path, &stat, options.mode, None)
729}
730
731fn prepare_header(size: u64, entry_type: u8) -> Header {
732    let mut header = Header::new_gnu();
733    let name = b"././@LongLink";
734    header.as_gnu_mut().unwrap().name[..name.len()].clone_from_slice(&name[..]);
735    header.set_mode(0o644);
736    header.set_uid(0);
737    header.set_gid(0);
738    header.set_mtime(0);
739    // + 1 to be compliant with GNU tar
740    header.set_size(size + 1);
741    header.set_entry_type(EntryType::new(entry_type));
742    header.set_cksum();
743    header
744}
745
746fn prepare_header_path(dst: &mut dyn Write, header: &mut Header, path: &Path) -> io::Result<()> {
747    // Try to encode the path directly in the header, but if it ends up not
748    // working (probably because it's too long) then try to use the GNU-specific
749    // long name extension by emitting an entry which indicates that it's the
750    // filename.
751    if let Err(e) = header.set_path(path) {
752        let data = path2bytes(&path)?;
753        let max = header.as_old().name.len();
754        // Since `e` isn't specific enough to let us know the path is indeed too
755        // long, verify it first before using the extension.
756        if data.len() < max {
757            return Err(e);
758        }
759        let header2 = prepare_header(data.len() as u64, b'L');
760        // null-terminated string
761        let mut data2 = data.chain(io::repeat(0).take(1));
762        append(dst, &header2, &mut data2)?;
763
764        // Truncate the path to store in the header we're about to emit to
765        // ensure we've got something at least mentioned. Note that we use
766        // `str`-encoding to be compatible with Windows, but in general the
767        // entry in the header itself shouldn't matter too much since extraction
768        // doesn't look at it.
769        let truncated = match str::from_utf8(&data[..max]) {
770            Ok(s) => s,
771            Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(),
772        };
773        header.set_truncated_path_for_gnu_header(&truncated)?;
774    }
775    Ok(())
776}
777
778fn prepare_header_link(
779    dst: &mut dyn Write,
780    header: &mut Header,
781    link_name: &Path,
782) -> io::Result<()> {
783    // Same as previous function but for linkname
784    if let Err(e) = header.set_link_name(&link_name) {
785        let data = path2bytes(&link_name)?;
786        if data.len() < header.as_old().linkname.len() {
787            return Err(e);
788        }
789        let header2 = prepare_header(data.len() as u64, b'K');
790        let mut data2 = data.chain(io::repeat(0).take(1));
791        append(dst, &header2, &mut data2)?;
792    }
793    Ok(())
794}
795
796fn prepare_header_sparse(
797    file: &mut fs::File,
798    stat: &fs::Metadata,
799    header: &mut Header,
800) -> io::Result<Option<SparseEntries>> {
801    let entries = match find_sparse_entries(file, stat)? {
802        Some(entries) => entries,
803        _ => return Ok(None),
804    };
805
806    header.set_entry_type(EntryType::GNUSparse);
807    header.set_size(entries.on_disk_size);
808
809    // Write the first 4 (GNU_SPARSE_HEADERS_COUNT) entries to the given header.
810    // The remaining entries will be written as subsequent extended headers. See
811    // https://www.gnu.org/software/tar/manual/html_section/Sparse-Formats.html#Old-GNU-Format
812    // for details on the format.
813    let gnu_header = &mut header.as_gnu_mut().unwrap();
814    gnu_header.set_real_size(entries.size());
815
816    for (entry, header_entry) in std::iter::zip(&entries.entries, &mut gnu_header.sparse) {
817        header_entry.set_offset(entry.offset);
818        header_entry.set_length(entry.num_bytes);
819    }
820    gnu_header.set_is_extended(entries.entries.len() > gnu_header.sparse.len());
821
822    Ok(Some(entries))
823}
824
825/// Write extra sparse headers into `dst` for those entries that did not fit in the main header.
826fn append_extended_sparse_headers(dst: &mut dyn Write, entries: &SparseEntries) -> io::Result<()> {
827    // The first `GNU_SPARSE_HEADERS_COUNT` entries are written to the main header, so skip them.
828    let mut it = entries
829        .entries
830        .iter()
831        .skip(GNU_SPARSE_HEADERS_COUNT)
832        .peekable();
833
834    // Each GnuExtSparseHeader can hold up to fixed number of sparse entries (21).
835    // So we pack entries into multiple headers if necessary.
836    while it.peek().is_some() {
837        let mut ext_header = GnuExtSparseHeader::new();
838        for header_entry in ext_header.sparse.iter_mut() {
839            if let Some(entry) = it.next() {
840                header_entry.set_offset(entry.offset);
841                header_entry.set_length(entry.num_bytes);
842            } else {
843                break;
844            }
845        }
846        ext_header.set_is_extended(it.peek().is_some());
847        dst.write_all(ext_header.as_bytes())?;
848    }
849
850    Ok(())
851}
852
853fn append_fs(
854    dst: &mut dyn Write,
855    path: &Path,
856    meta: &fs::Metadata,
857    mode: HeaderMode,
858    link_name: Option<&Path>,
859) -> io::Result<()> {
860    let mut header = Header::new_gnu();
861
862    prepare_header_path(dst, &mut header, path)?;
863    header.set_metadata_in_mode(meta, mode);
864    if let Some(link_name) = link_name {
865        prepare_header_link(dst, &mut header, link_name)?;
866    }
867    header.set_cksum();
868    dst.write_all(header.as_bytes())
869}
870
871fn append_dir_all(
872    dst: &mut dyn Write,
873    path: &Path,
874    src_path: &Path,
875    options: BuilderOptions,
876) -> io::Result<()> {
877    let mut stack = vec![(src_path.to_path_buf(), true, false)];
878    while let Some((src, is_dir, is_symlink)) = stack.pop() {
879        let dest = path.join(src.strip_prefix(&src_path).unwrap());
880        // In case of a symlink pointing to a directory, is_dir is false, but src.is_dir() will return true
881        if is_dir || (is_symlink && options.follow && src.is_dir()) {
882            for entry in fs::read_dir(&src)? {
883                let entry = entry?;
884                let file_type = entry.file_type()?;
885                stack.push((entry.path(), file_type.is_dir(), file_type.is_symlink()));
886            }
887            if dest != Path::new("") {
888                append_dir(dst, &dest, &src, options)?;
889            }
890        } else if !options.follow && is_symlink {
891            let stat = fs::symlink_metadata(&src)?;
892            let link_name = fs::read_link(&src)?;
893            append_fs(dst, &dest, &stat, options.mode, Some(&link_name))?;
894        } else {
895            #[cfg(unix)]
896            {
897                let stat = fs::metadata(&src)?;
898                if !stat.is_file() {
899                    append_special(dst, &dest, &stat, options.mode)?;
900                    continue;
901                }
902            }
903            append_file(dst, &dest, &mut fs::File::open(src)?, options)?;
904        }
905    }
906    Ok(())
907}
908
/// Sparse map of a file: the list of its regions plus the number of bytes that
/// are actually stored.
#[derive(Debug, Clone, PartialEq, Eq)]
struct SparseEntries {
    /// Regions of the file, in the order they were recorded.
    entries: Vec<SparseEntry>,
    /// Number of bytes stored for the file.
    on_disk_size: u64,
}

impl SparseEntries {
    /// Returns the end offset (`offset + num_bytes`) of the last entry, or `0`
    /// when there are no entries.
    fn size(&self) -> u64 {
        match self.entries.last() {
            Some(tail) => tail.offset + tail.num_bytes,
            None => 0,
        }
    }
}

/// One contiguous region of a file.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
struct SparseEntry {
    /// Byte offset of the region from the start of the file.
    offset: u64,
    /// Length of the region in bytes.
    num_bytes: u64,
}
926
927/// Find sparse entries in a file. Returns:
928/// * `Ok(Some(_))` if the file is sparse.
929/// * `Ok(None)` if the file is not sparse, or if the file system does not
930///    support sparse files.
931/// * `Err(_)` if an error occurred. The lack of support for sparse files is not
932///    considered an error. It might return an error if the file is modified
933///    while reading.
934fn find_sparse_entries(
935    file: &mut fs::File,
936    stat: &fs::Metadata,
937) -> io::Result<Option<SparseEntries>> {
938    #[cfg(not(any(target_os = "android", target_os = "freebsd", target_os = "linux")))]
939    {
940        let _ = file;
941        let _ = stat;
942        Ok(None)
943    }
944
945    #[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))]
946    find_sparse_entries_seek(file, stat)
947}
948
949/// Implementation of `find_sparse_entries` using `SEEK_HOLE` and `SEEK_DATA`.
950#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))]
951fn find_sparse_entries_seek(
952    file: &mut fs::File,
953    stat: &fs::Metadata,
954) -> io::Result<Option<SparseEntries>> {
955    use std::os::unix::fs::MetadataExt as _;
956    use std::os::unix::io::AsRawFd as _;
957
958    fn lseek(file: &fs::File, offset: i64, whence: libc::c_int) -> Result<i64, i32> {
959        #[cfg(any(target_os = "linux", target_os = "android"))]
960        let lseek = libc::lseek64;
961        #[cfg(not(any(target_os = "linux", target_os = "android")))]
962        let lseek = libc::lseek;
963
964        match unsafe { lseek(file.as_raw_fd(), offset, whence) } {
965            -1 => Err(io::Error::last_os_error().raw_os_error().unwrap()),
966            off => Ok(off),
967        }
968    }
969
970    if stat.blocks() == 0 {
971        return Ok(if stat.size() == 0 {
972            // Empty file.
973            None
974        } else {
975            // Fully sparse file.
976            Some(SparseEntries {
977                entries: vec![SparseEntry {
978                    offset: stat.size(),
979                    num_bytes: 0,
980                }],
981                on_disk_size: 0,
982            })
983        });
984    }
985
986    // On most Unices, we need to read `_PC_MIN_HOLE_SIZE` to see if the file
987    // system supports `SEEK_HOLE`.
988    // FreeBSD: https://man.freebsd.org/cgi/man.cgi?query=lseek&sektion=2&manpath=FreeBSD+14.1-STABLE
989    #[cfg(not(any(target_os = "linux", target_os = "android")))]
990    if unsafe { libc::fpathconf(file.as_raw_fd(), libc::_PC_MIN_HOLE_SIZE) } == -1 {
991        return Ok(None);
992    }
993
994    // Linux is the only UNIX-like without support for `_PC_MIN_HOLE_SIZE`, so
995    // instead we try to call `lseek` and see if it fails.
996    #[cfg(any(target_os = "linux", target_os = "android"))]
997    match lseek(file, 0, libc::SEEK_HOLE) {
998        Ok(_) => (),
999        Err(libc::ENXIO) => {
1000            // The file is empty. Treat it as non-sparse.
1001            return Ok(None);
1002        }
1003        Err(_) => return Ok(None),
1004    }
1005
1006    let mut entries = Vec::new();
1007    let mut on_disk_size = 0;
1008    let mut off_s = 0;
1009    loop {
1010        //  off_s=0      │     off_s               │ off_s
1011        //    ↓          │       ↓                 │   ↓
1012        //    | DATA |…  │  ……………| HOLE | DATA |…  │  …|×EOF×
1013        //    ↑          │       ↑      ↑          │
1014        //   (a)         │  (b) (c)    (d)         │     (e)
1015        match lseek(file, off_s, libc::SEEK_DATA) {
1016            Ok(0) if off_s == 0 => (), // (a) The file starts with data.
1017            Ok(off) if off < off_s => {
1018                // (b) Unlikely.
1019                return Err(std::io::Error::new(
1020                    io::ErrorKind::Other,
1021                    "lseek(SEEK_DATA) went backwards",
1022                ));
1023            }
1024            Ok(off) if off == off_s => {
1025                // (c) The data at the same offset as the hole.
1026                return Err(std::io::Error::new(
1027                    io::ErrorKind::Other,
1028                    "lseek(SEEK_DATA) did not advance. \
1029                     Did the file change while appending?",
1030                ));
1031            }
1032            Ok(off) => off_s = off,    // (d) Jump to the next hole.
1033            Err(libc::ENXIO) => break, // (e) Reached the end of the file.
1034            Err(errno) => return Err(io::Error::from_raw_os_error(errno)),
1035        };
1036
1037        // off_s=0          │     off_s               │    off_s
1038        //   ↓              │       ↓                 │      ↓
1039        //   | DATA |×EOF×  │  ……………| DATA | HOLE |…  │  …|×EOF×
1040        //          ↑       │       ↑      ↑          │
1041        //         (a)      │  (b) (c)    (d)         │     (e)
1042        match lseek(file, off_s, libc::SEEK_HOLE) {
1043            Ok(off_e) if off_s == 0 && (off_e as u64) == stat.size() => {
1044                // (a) The file is not sparse.
1045                file.seek(io::SeekFrom::Start(0))?;
1046                return Ok(None);
1047            }
1048            Ok(off_e) if off_e < off_s => {
1049                // (b) Unlikely.
1050                return Err(std::io::Error::new(
1051                    io::ErrorKind::Other,
1052                    "lseek(SEEK_HOLE) went backwards",
1053                ));
1054            }
1055            Ok(off_e) if off_e == off_s => {
1056                // (c) The hole at the same offset as the data.
1057                return Err(std::io::Error::new(
1058                    io::ErrorKind::Other,
1059                    "lseek(SEEK_HOLE) did not advance. \
1060                     Did the file change while appending?",
1061                ));
1062            }
1063            Ok(off_e) => {
1064                // (d) Found a hole or reached the end of the file (implicit
1065                // zero-length hole).
1066                entries.push(SparseEntry {
1067                    offset: off_s as u64,
1068                    num_bytes: off_e as u64 - off_s as u64,
1069                });
1070                on_disk_size += off_e as u64 - off_s as u64;
1071                off_s = off_e;
1072            }
1073            Err(libc::ENXIO) => {
1074                // (e) off_s was already beyond the end of the file.
1075                return Err(std::io::Error::new(
1076                    io::ErrorKind::Other,
1077                    "lseek(SEEK_HOLE) returned ENXIO. \
1078                     Did the file change while appending?",
1079                ));
1080            }
1081            Err(errno) => return Err(io::Error::from_raw_os_error(errno)),
1082        };
1083    }
1084
1085    if off_s as u64 > stat.size() {
1086        return Err(std::io::Error::new(
1087            io::ErrorKind::Other,
1088            "lseek(SEEK_DATA) went beyond the end of the file. \
1089             Did the file change while appending?",
1090        ));
1091    }
1092
1093    // Add a final zero-length entry. It is required if the file ends with a
1094    // hole, and redundant otherwise. However, we add it unconditionally to
1095    // mimic GNU tar behavior.
1096    entries.push(SparseEntry {
1097        offset: stat.size(),
1098        num_bytes: 0,
1099    });
1100
1101    file.seek(io::SeekFrom::Start(0))?;
1102
1103    Ok(Some(SparseEntries {
1104        entries,
1105        on_disk_size,
1106    }))
1107}
1108
1109impl<W: Write> Drop for Builder<W> {
1110    fn drop(&mut self) {
1111        let _ = self.finish();
1112    }
1113}
1114
#[cfg(test)]
mod tests {
    use super::*;

    /// Size of one block in the file layouts used below. It should be a
    /// multiple of 4KiB on ext4, of 32KiB on FreeBSD/UFS, and of 64KiB on
    /// ppc64el.
    const SPARSE_BLOCK_SIZE: u64 = 64 * 1024;

    /// Builds an entry whose offset and length are given in blocks.
    fn blocks(offset: u64, len: u64) -> SparseEntry {
        SparseEntry {
            offset: offset * SPARSE_BLOCK_SIZE,
            num_bytes: len * SPARSE_BLOCK_SIZE,
        }
    }

    #[test]
    fn test_find_sparse_entries() {
        // Each case pairs a picture of the file (`#` is a data block, a blank
        // is a hole) with the map of its data regions, terminated by a
        // zero-length entry at the end of the file.
        let cases: Vec<(&str, Vec<SparseEntry>)> = vec![
            ("|", vec![]),
            ("|    |    |    |    |", vec![blocks(4, 0)]),
            ("|####|####|####|####|", vec![blocks(0, 4), blocks(4, 0)]),
            ("|####|####|    |    |", vec![blocks(0, 2), blocks(4, 0)]),
            ("|    |    |####|####|", vec![blocks(2, 2), blocks(4, 0)]),
            (
                "|####|    |####|    |",
                vec![blocks(0, 1), blocks(2, 1), blocks(4, 0)],
            ),
            (
                "|####|    |    |####|",
                vec![blocks(0, 1), blocks(3, 1), blocks(4, 0)],
            ),
            ("|    |####|####|    |", vec![blocks(1, 2), blocks(4, 0)]),
        ];

        let mut file = tempfile::tempfile().unwrap();
        let block = vec![0xFF_u8; SPARSE_BLOCK_SIZE as usize];

        for (description, map) in &cases {
            // Truncate to zero first so nothing from the previous case
            // survives, then grow the file to its final size.
            file.set_len(0).unwrap();
            let total_len = map.last().map_or(0, |e| e.offset + e.num_bytes);
            file.set_len(total_len).unwrap();

            // Fill in the data regions block by block.
            for e in map {
                file.seek(io::SeekFrom::Start(e.offset)).unwrap();
                for _ in 0..e.num_bytes / SPARSE_BLOCK_SIZE {
                    file.write_all(&block).unwrap();
                }
            }

            let expected = match map.as_slice() {
                // Empty file.
                [] => None,

                // 100% dense.
                [SparseEntry {
                    offset: 0,
                    num_bytes: data_len,
                }, SparseEntry {
                    offset: end,
                    num_bytes: 0,
                }] if data_len == end => None,

                // Sparse.
                sparse => Some(SparseEntries {
                    entries: sparse.to_vec(),
                    on_disk_size: sparse.iter().map(|e| e.num_bytes).sum(),
                }),
            };

            let stat = file.metadata().unwrap();
            let reported = find_sparse_entries(&mut file, &stat).unwrap();

            // Loose check: we did not miss any data blocks.
            if let Err(e) = loose_check_sparse_entries(reported.as_ref(), expected.as_ref()) {
                panic!(
                    "Case: {description}\n\
                     Reported: {reported:?}\n\
                     Expected: {expected:?}\n\
                     Error: {e}",
                );
            }

            // On Linux, always do a strict check. Skip on FreeBSD, as on UFS
            // the last block is always dense, even if it's zero-filled.
            #[cfg(any(target_os = "android", target_os = "linux"))]
            assert_eq!(reported, expected, "Case: {description}");
        }
    }

    /// Checks that `reported` is an acceptable answer for a file whose exact
    /// sparse map is `expected`.
    ///
    /// Reporting a sparse file as dense, or reporting holes as data, is
    /// tolerated. Missing data, a wrong final entry, unsorted or overlapping
    /// entries, and a wrong `on_disk_size` are not.
    fn loose_check_sparse_entries(
        reported: Option<&SparseEntries>,
        expected: Option<&SparseEntries>,
    ) -> Result<(), &'static str> {
        let (reported, expected) = match (reported, expected) {
            // It's not an error to report a sparse file as non-sparse.
            (None, _) => return Ok(()),
            (Some(_), None) => return Err("Expected dense file, but reported as sparse"),
            (Some(reported), Some(expected)) => (reported, expected),
        };

        // Check that we didn't miss any data blocks. However, reporting some
        // holes as data is not an error during the loose check.
        let is_covered = |e: &SparseEntry| {
            reported
                .entries
                .iter()
                .any(|r| r.offset <= e.offset && e.offset + e.num_bytes <= r.offset + r.num_bytes)
        };
        if !expected.entries.iter().all(is_covered) {
            return Err("Reported is not a superset of expected");
        }

        if reported.entries.last() != expected.entries.last() {
            return Err("Last zero-length entry is not as expected");
        }

        // Check invariants of SparseEntries: every entry starts at or after
        // the end of the one before it.
        let out_of_order = reported
            .entries
            .windows(2)
            .any(|pair| pair[1].offset < pair[0].offset + pair[0].num_bytes);
        if out_of_order {
            return Err("Overlapping or unsorted entries");
        }

        let stored: u64 = reported.entries.iter().map(|e| e.num_bytes).sum();
        if reported.on_disk_size != stored {
            return Err("Incorrect on-disk size");
        }

        Ok(())
    }
}