guppy/graph/
cycles.rs

1// Copyright (c) The cargo-guppy Contributors
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Code for handling cycles in dependency graphs.
5//!
6//! See [`Cycles`][] for detailed docs.
7
8use crate::{
9    graph::{PackageGraph, PackageIx},
10    petgraph_support::scc::Sccs,
11    Error, PackageId,
12};
13
14/// Contains information about dependency cycles.
15///
16/// More accurately, information about Strongly Connected Components with 2 or more elements.
17/// Constructed through `PackageGraph::cycles`.
18///
19/// This page includes a bunch of detailed information on cycles, but here's the TLDR:
20///
21/// * Yes, cycles can happen
22/// * Cycles only happen with dev-dependencies
23/// * These cycles have properties that make them easy to handle
24/// * We handle this in APIs like [`PackageSet::packages`][`crate::graph::PackageSet::packages`]
25/// * As a result, you probably don't actually need this module
26///
27/// The slighly more detailed summary is that any graph of "packages" is conflating
28/// the "real" package with its tests, which are actually separate binaries. These
29/// tests *always* depend on the "real" package, and if we bothered to encode that
30/// then any package with tests would have a cyclic dependency on itself -- so we
31/// don't encode that. Unfortunately dev-dependencies allow tests to *indirectly*
32/// depend on the "real" package, creating a cycle you *do* see.
33///
34/// If you only care about "real" builds, you can simply ignore the dev-dependency
35/// edges and restore a nice and simple DAG that can be topologically sorted. This is what
36/// we do for you in APIs like [`PackageSet::packages`][`crate::graph::PackageSet::packages`].
37///
38/// If you care about tests and dev-dependencies, we recommend treating those as
39/// different from the "real" ones (essentially desugarring the package into two nodes).
40/// Because all dev builds are roots of the package graph (nothing depends on a test/benchmark),
41/// they can always go at the start/end (depending on direction) of the topological sort.
42/// This means you can just do add a second loop before/after the "real" one.
43///
44/// For instance, here's a simple program that recursively computes some property of packages
45/// (here "whether serde is a transitive dependency"):
46///
47/// ```
48/// use guppy::{CargoMetadata, graph::DependencyDirection};
49/// use std::collections::HashMap;
50///
51/// let metadata = CargoMetadata::parse_json(include_str!("../../../fixtures/small/metadata1.json")).unwrap();
52/// let package_graph = metadata.build_graph().unwrap();
53/// let workspace_members = package_graph.resolve_workspace();
54/// let dependency_graph = package_graph.query_workspace().resolve();
55///
56/// // Whether the "real" package uses serde
57/// let mut package_uses_serde = HashMap::new();
58/// // Whether the "dev" package uses serde
59/// let mut dev_package_uses_serde = HashMap::new();
60///
61/// // Iterate over packages in reverse topo order (process dependencies first)
62/// for package in dependency_graph.packages(DependencyDirection::Reverse) {
63///     // A package uses serde if...
64///     let uses_serde = if package.name() == "serde" {
65///         // It is literally serde (base case)
66///         true
67///     } else {
68///         // It has a non-dev-dependency on a package which uses serde
69///         // (dev-dependencies handled in the second loop)
70///         package.direct_links().any(|link| {
71///             !link.dev_only() && package_uses_serde[link.to().id()]
72///         })
73///     };
74///     // Record this package's result
75///     package_uses_serde.insert(package.id(), uses_serde);
76/// }
77///
78/// // Now iterate over the workspace members to handle their tests (if any)
79/// // Note that DependencyDirection doesn't matter here, we're literally
80/// // just looping over every workspace member in arbitrary order!
81/// for package in workspace_members.packages(DependencyDirection::Reverse) {
82///     // Check dev-packages using the "real" package results for all links!
83///     let uses_serde = package.direct_links().any(|link| {
84///         package_uses_serde[link.to().id()]
85///     });
86///     // Record this dev-package's result
87///     dev_package_uses_serde.insert(package.id(), uses_serde);
88/// }
89///
90/// // Now we have all the values computed!
91/// for (id, &uses_serde) in &package_uses_serde {
92///     if uses_serde {
93///         let name = package_graph.metadata(id).unwrap().name();
94///         println!("{name} uses serde!");
95///     }
96/// }
97/// for (id, &uses_serde) in &dev_package_uses_serde {
98///     if uses_serde {
99///         let name = package_graph.metadata(id).unwrap().name();
100///         println!("{name}'s tests use serde!");
101///     }
102/// }
103/// ```
104///
105///
106///
107///
108///
109/// # Why Cargo Dependency Graphs Have Cycles
110///
111/// Dependency graphs are generally Directed Acyclic Graphs (DAGs), where each package
112/// is a node and each dependency is an edge. These graphs are acyclic (contain no cycles)
113/// because anything else would be a paradox -- how do you build X if it depends on itself?
114/// You don't!
115///
116/// So why does this API exist? It wouldn't make sense for Cargo to have cycles!
117///
118/// The problem is that "the Cargo dependency graph" is actually two different graphs
119/// at different levels of abstraction: The Package Graph (Guppy, cargo-metadata), and
120/// The Build Graph (Cargo's internals). These two graphs are different because each
121/// package is actually a bunch of different
122/// [build targets in a trenchcoat][`crate::graph::PackageMetadata::build_targets`] -- libs,
123/// bins, tests, benches, and so on. In The Build Graph these different build targets get
124/// their own nodes. In The Package Graph all those targets gets merged together into one
125/// big node. The Build Graph is always a proper DAG, but The Package Graph can have cycles.
126///
127/// Thankfully these cycles can only be created by one specific (and rare) situation:
128/// dev-dependencies. **A test/bench target for a package is allowed to indirectly
129/// depend on the same package's lib/bin target, and this creates apparent cycles
130/// in the package graph!** That's it!
131///
132/// As we'll see, **simply ignoring all dev-dependency edges eliminates all cycles
133/// *and* preserves the ordering constraints of the dependency graph.**
134///
135///
136///
137/// # An Example Cyclic Workspace
138///
139/// As a concrete example, consider [the serde workspace][serde_github], which
140/// actually has this "problem": there's a "cycle" between serde and serde_derive.
141/// In normal builds this cycle doesn't exist: serde_derive is actually a standalone
142/// crate, while [serde (optionally) pulls in serde_derive as a dependency][serde_toml].
143/// The "cycle" only appears when testing serde_derive: [serde_derive's tests quite
144/// reasonably depend on serde][serde_derive_toml] to test the proc-macro's output,
145/// creating a cycle!
146///
147/// The way to resolve this monstrosity is to realize that the tests for serde_derive
148/// are actually a completely different binary from the serde_derive *library*. Let's
149/// call those tests serde_derive_dev. So although the (Package) graph reported by Guppy
150/// (and cargo-metadata) looks like a cycle:
151///
152/// ```text
153/// serde <-----+
154///   |         |
155///   |         |
156///   +--> serde_derive
157/// ```
158///
159/// In actuality, serde_derive_dev breaks the cycle and creates a nice clean DAG
160/// (in The Build Graph):
161///
162/// ```text
163///   +-- serde_derive_dev
164///   |          |
165///   v          |
166/// serde        |
167///   |          |
168///   |          v
169///   +---> serde_derive
170/// ```
171///
172/// Here's the really important thing to notice: serde_derive_dev is actually a *root*
173/// in The Build Graph, and this is always true! Nothing should ever depend on the *tests*
174/// or *benchmarks* for another library.
175///
176/// This is the key insight to ignoring dev-dependency edges. As we'll see, the roots
177/// (and leaves) of a DAG are in some sense "ignorable" by the rest of the graph,
178/// because they can't change the ordering constraints between other packages.
179///
180///
181///
182/// # Topological Sort Is Great (And Composable)
183///
184/// Now that we understand *why* cycles can happen in the package graph, let's look at
185/// what those cycles mess up, and how to deal with them.
186///
187/// One of the big reasons everyone loves DAGs is because you can get a Topological
188/// Sort of them. Topological Sort
189/// (with [`DependencyDirection::Forward`][`crate::graph::DependencyDirection::Forward`])
190/// is just a fancy way of saying "a list where packages always appear before their dependencies"
191/// (vice-versa for [`DependencyDirection::Reverse`][`crate::graph::DependencyDirection::Reverse`]).
192///
193/// This is really convenient! If you need to do things in "dependency order" you can just
194/// topologically sort the packages and then boring old for-loops will magically get
195/// everything done before it's needed.
196///
197/// Unfortunately, you can't get the Topological Sort of a graph with cycles because that
198/// doesn't make sense. And yet, Guppy has
199/// [several APIs which do exactly that][`crate::graph::PackageSet::packages`].
200/// What gives? The docs say:
201///
202/// > The packages within a dependency cycle will be returned in non-dev order. When the
203/// > direction is forward, if package Foo has a dependency on Bar, and Bar has a cyclic
204/// > dev-dependency on Foo, then Foo is returned before Bar.
205///
206/// We just ignore the dev-dependency edges! Problem Solved.
207///
208/// But isn't this throwing out important information that could change the result? Nope!
209///
210/// As we saw in the previous section, all dev-builds are roots in The Build Graph.
211/// Ignoring all dev-dependency edges is equivalent to deleting all of those roots.
212/// This may "orphan" dependencies that are only used for dev-builds, but we still
213/// keep them in the graph and properly include them in the sort.
214///
215/// As it turns out, you can recursively compute the topological sort of a graph as follows:
216///
217/// 1. delete a root (or leaf)
218/// 2. compute the topological sort of the new graph
219/// 3. append the root (or leaf) to the start (or end) of the list
220///
221/// **Even although we delete all the dev-nodes from the graph when doing our sort,
222/// if you want to "add them back" the only thing you need to do is handle them before
223/// (or after) everything else!** Even better: all the dev-builds are roots at the same
224/// time, so you can process them in any order!
225///
226/// Just remember that every node with dev-dependencies is really two nodes: the "normal"
227/// version without dev-dependencies, and the version with them. Exactly how you want
228/// to express that notion in your code is up to you. (Two different loops is the simplest.)
229///
230///
231///
232///
233/// # Reasoning About Cycles: Strongly Connected Components
234///
235/// Ok but wait, none of that involved Strongly Connected Components! Yeah, isn't that great? 😄
236///
237/// Oh you still want to "know" about the cycles? Then we've gotta bust out the heavy
238/// general-purpose machinery. Thankfully the problem of cycles in directed graphs is
239/// an old and well-studied problem with a conceptually simple solution: hide the cycle
240/// in a box and pretend that it's just one Really Big Node in the DAG.
241///
242/// Yes, really, that's all that Strongly Connected Components are. More precisely, SCCs
243/// are defined to be maximal sets of nodes such that "every node in an SCC can reach
244/// every other node in that SCC" (a property which definitely holds for cycles).
245/// The reason for this more complicated definition is that you can have a bunch of
246/// cycles all knotted together in a nasty ball, and trying to tease out individual
247/// cycles isn't really helpful. So we just wrap the whole ball of nodes up into one
248/// big "I give up" box and forget about it!
249///
250/// Now, what does this get us?
251///
252/// The graph *between* Strongly Connected Components is *always* a DAG, so you can
253/// always topologically sort *that*. In really nasty cases this is just vacuously
254/// true (all the nodes end up in one SCC, and so the "Graph of SCCs" is just one big
255/// unsorted node). On the other hand, if the graph already *is* a DAG then each node
256/// is its own SCC, and so we lose nothing. In this way SCCs give us a way to preserve
257/// all the *nice* parts of our graph while also isolating the problematic parts
258/// (SCCs with more than 1 node) to something self-contained that we can handle specially.
259///
260/// In the general case, nothing more can be done to order an SCC. By definition every
261/// node depends on every other node! But as we've seen in the previous section, there
262/// actually *is* a good way to order packages even with cycles, and so we maintain
263/// that ordering for our SCCs: it's just the topological sort with all the
264/// dev-dependencies ignored.
265///
266///
267///
268///
269/// [serde_github]: https://github.com/serde-rs/serde
270/// [serde_toml]: https://github.com/serde-rs/serde/blob/072145e0e913df7686f001dbf29e43a0ff7afac4/serde/Cargo.toml#L17-L18
271/// [serde_derive_toml]: https://github.com/serde-rs/serde/blob/072145e0e913df7686f001dbf29e43a0ff7afac4/serde_derive/Cargo.toml#L29-L30
272pub struct Cycles<'g> {
273    package_graph: &'g PackageGraph,
274    sccs: &'g Sccs<PackageIx>,
275}
276
277impl<'g> Cycles<'g> {
278    pub(super) fn new(package_graph: &'g PackageGraph) -> Self {
279        Self {
280            package_graph,
281            sccs: package_graph.sccs(),
282        }
283    }
284
285    /// Returns true if these two IDs are in the same cycle.
286    ///
287    /// This is equivalent to checking if they're in the same Strongly Connected Component.
288    pub fn is_cyclic(&self, a: &PackageId, b: &PackageId) -> Result<bool, Error> {
289        let a_ix = self.package_graph.package_ix(a)?;
290        let b_ix = self.package_graph.package_ix(b)?;
291        Ok(self.sccs.is_same_scc(a_ix, b_ix))
292    }
293
294    /// Returns all the Strongly Connected Components (SCCs) of 2 or more elements in this graph.
295    ///
296    /// SCCs are returned in topological order: if packages in SCC B depend on packages in SCC
297    /// A, A is returned before B.
298    ///
299    /// Within an SCC, nodes are returned in non-dev order: if package Foo has a dependency on Bar,
300    /// and Bar has a cyclic dev-dependency on Foo, then Foo is returned before Bar.
301    ///
302    /// See the type-level docs for details.
303    pub fn all_cycles(&self) -> impl DoubleEndedIterator<Item = Vec<&'g PackageId>> + 'g {
304        let dep_graph = &self.package_graph.dep_graph;
305        self.sccs
306            .multi_sccs()
307            .map(move |scc| scc.iter().map(move |ix| &dep_graph[*ix]).collect())
308    }
309}