diff --git a/Cargo.lock b/Cargo.lock index 05c98210a..4ed16af8e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3244,7 +3244,6 @@ dependencies = [ "la-arena", "matchit 0.9.0", "miette", - "num_cpus", "once_cell", "pavex", "pavex_bp_schema", @@ -3252,18 +3251,16 @@ dependencies = [ "pavex_cli_shell", "pavex_rustdoc_types", "pavexc_attr_parser", + "pavexc_rustdoc_cache", "persist_if_changed", "petgraph", "prettyplease", "proc-macro2", "px_workspace_hack", "quote", - "r2d2", - "r2d2_sqlite", "rayon", "relative-path", "rkyv", - "rusqlite", "rustc-hash", "semver", "serde", @@ -3276,8 +3273,6 @@ dependencies = [ "toml_edit 0.24.0+spec-1.1.0", "tracing", "tracing_log_error", - "vergen-gitcl", - "xdg-home", "xxhash-rust", ] @@ -3346,6 +3341,40 @@ dependencies = [ "thiserror 2.0.17", ] +[[package]] +name = "pavexc_rustdoc_cache" +version = "0.2.10" +dependencies = [ + "ahash", + "anyhow", + "bincode", + "camino", + "fs-err", + "globwalk", + "guppy", + "itertools 0.14.0", + "num_cpus", + "once_cell", + "pavex_bp_schema", + "pavex_rustdoc_types", + "pavexc_attr_parser", + "px_workspace_hack", + "r2d2", + "r2d2_sqlite", + "rkyv", + "rusqlite", + "rustc-hash", + "serde", + "serde_json", + "serde_stacker", + "thiserror 2.0.17", + "toml 0.9.10+spec-1.1.0", + "tracing", + "tracing_log_error", + "xdg-home", + "xxhash-rust", +] + [[package]] name = "pear" version = "0.2.9" @@ -3629,6 +3658,7 @@ dependencies = [ "aho-corasick", "base64", "bincode", + "bitflags", "byteorder", "clap", "clap_builder", diff --git a/Cargo.toml b/Cargo.toml index 53bc1d107..f7312389a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ pavex_test_runner = { path = "compiler/pavex_test_runner", version = "0.2.7" } pavexc = { path = "compiler/pavexc", version = "0.2.10" } pavexc_attr_parser = { path = "compiler/pavexc_attr_parser", version = "0.2.10" } pavexc_cli_client = { path = "compiler/pavexc_cli_client", version = "0.2.10" } +pavexc_rustdoc_cache = { path = "compiler/pavexc_rustdoc_cache", version = 
"0.2.10" } persist_if_changed = { path = "compiler/persist_if_changed", version = "0.2.10" } # Our own fork of `rustdoc-types` to minimise (de)ser overhead. rustdoc-types = { path = "compiler/pavex_rustdoc_types", version = "0.2.10", features = ["rustc-hash"], package = "pavex_rustdoc_types" } diff --git a/compiler/pavexc/Cargo.toml b/compiler/pavexc/Cargo.toml index 82e003855..d090e2f0c 100644 --- a/compiler/pavexc/Cargo.toml +++ b/compiler/pavexc/Cargo.toml @@ -11,8 +11,10 @@ license.workspace = true clippy = { large_enum_variant = "allow", result_large_err = "allow" } [build-dependencies] -vergen-gitcl = { workspace = true } +xxhash-rust = { workspace = true, features = ["xxh64"] } +globwalk = { workspace = true } anyhow = { workspace = true } +toml = { workspace = true } [features] # Enable additional debug assertions to ensure correctness @@ -24,6 +26,7 @@ debug_assertions = [] [dependencies] pavex = { workspace = true } pavexc_attr_parser = { path = "../pavexc_attr_parser", version = "=0.2.10" } +pavexc_rustdoc_cache = { path = "../pavexc_rustdoc_cache", version = "=0.2.10" } pavex_bp_schema = { path = "../pavex_bp_schema", version = "=0.2.10" } pavex_cli_shell = { path = "../pavex_cli_shell", version = "=0.2.10" } pavex_cli_diagnostic = { path = "../pavex_cli_diagnostic", version = "=0.2.10" } @@ -63,19 +66,10 @@ persist_if_changed = { workspace = true } matchit = { workspace = true } relative-path = { workspace = true } camino = { workspace = true } -xxhash-rust = { workspace = true, features = ["xxh64"] } rustc-hash = { workspace = true } -globwalk = { workspace = true } rkyv = { workspace = true } - -# Sqlite cache -xdg-home = { workspace = true } -rusqlite = { workspace = true, features = ["bundled"] } -r2d2_sqlite = { workspace = true } -r2d2 = { workspace = true } bincode = { workspace = true, features = ["serde"] } rayon = { workspace = true } -num_cpus = { workspace = true } px_workspace_hack = { version = "0.1", path = "../../px_workspace_hack" } 
[dev-dependencies] diff --git a/compiler/pavexc/build.rs b/compiler/pavexc/build.rs index 9974f2f5e..ba42a19ce 100644 --- a/compiler/pavexc/build.rs +++ b/compiler/pavexc/build.rs @@ -1,13 +1,111 @@ -use anyhow::Result; -use vergen_gitcl::{Emitter, GitclBuilder}; +use std::collections::BTreeSet; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; pub fn main() -> Result<()> { - Emitter::default() - .add_instructions( - &GitclBuilder::default() - .describe(true, false, None) - .build()?, - )? - .emit()?; + // Compute checksum of pavexc_rustdoc_cache and its local dependencies. + // This checksum is used as part of the cache fingerprint to ensure + // the cache invalidates when the caching logic or serialized types change. + let base_path = Path::new(env!("CARGO_MANIFEST_DIR")).join(".."); + let cache_crate_path = base_path.join("pavexc_rustdoc_cache"); + + // Find all local crates that pavexc_rustdoc_cache depends on (transitively) + let crates_to_checksum = collect_local_dependencies(&cache_crate_path)?; + + let mut combined_hasher = xxhash_rust::xxh64::Xxh64::new(24); + for crate_path in &crates_to_checksum { + let checksum = checksum_directory(crate_path)?; + combined_hasher.update(&checksum.to_le_bytes()); + + // Rerun if any of these crates change + println!("cargo::rerun-if-changed={}/src", crate_path.display()); + println!("cargo::rerun-if-changed={}/Cargo.toml", crate_path.display()); + } + + let checksum = combined_hasher.digest(); + println!("cargo::rustc-env=RUSTDOC_CACHE_SOURCE_HASH={checksum:x}"); + Ok(()) } + +/// Collect all local path dependencies of a crate, including the crate itself. +/// This is done recursively to capture transitive local dependencies. 
+fn collect_local_dependencies(crate_path: &Path) -> Result<BTreeSet<PathBuf>> { + let mut visited = BTreeSet::new(); + let mut to_visit = vec![crate_path.to_path_buf()]; + + while let Some(current) = to_visit.pop() { + let canonical = current + .canonicalize() + .with_context(|| format!("Failed to canonicalize path: {}", current.display()))?; + + if !visited.insert(canonical.clone()) { + continue; + } + + // Parse Cargo.toml to find path dependencies + let cargo_toml_path = canonical.join("Cargo.toml"); + let cargo_toml_content = std::fs::read_to_string(&cargo_toml_path) + .with_context(|| format!("Failed to read {}", cargo_toml_path.display()))?; + + let cargo_toml: toml::Table = toml::from_str(&cargo_toml_content) + .with_context(|| format!("Failed to parse {}", cargo_toml_path.display()))?; + + // Check [dependencies] section for path dependencies + if let Some(toml::Value::Table(deps)) = cargo_toml.get("dependencies") { + for (_name, value) in deps { + if let Some(path) = value.get("path").and_then(|p| p.as_str()) { + let dep_path = canonical.join(path); + if dep_path.exists() { + to_visit.push(dep_path); + } + } + } + } + } + + Ok(visited) +} + +/// Checksum the contents of a crate directory. +fn checksum_directory(root_path: &Path) -> Result<u64> { + let paths = get_file_paths(root_path)?; + + let mut hasher = xxhash_rust::xxh64::Xxh64::new(24); + for path in paths { + let contents = std::fs::read(&path) + .with_context(|| format!("Failed to read file at `{}`", path.display()))?; + hasher.update(&contents); + // Include the file path in the hash to detect renames + if let Ok(relative) = path.strip_prefix(root_path) { + hasher.update(relative.to_string_lossy().as_bytes()); + } + } + Ok(hasher.digest()) +} + +/// Get all source files in a crate directory. 
+fn get_file_paths(root_dir: &Path) -> Result<BTreeSet<PathBuf>> { + let root_dir = root_dir + .canonicalize() + .context("Failed to canonicalize the path to the root directory")?; + + let patterns = vec!["src/**/*.rs", "Cargo.toml"]; + + let glob_walker = globwalk::GlobWalkerBuilder::from_patterns(&root_dir, &patterns).build()?; + + let included_files: BTreeSet<PathBuf> = glob_walker + .into_iter() + .filter_map(|entry| { + let Ok(entry) = entry else { + return None; + }; + if !entry.file_type().is_file() { + return None; + } + Some(entry.into_path()) + }) + .collect(); + Ok(included_files) +} diff --git a/compiler/pavexc/src/compiler/analyses/user_components/annotations/mod.rs b/compiler/pavexc/src/compiler/analyses/user_components/annotations/mod.rs index e97c0c746..1b03b19e9 100644 --- a/compiler/pavexc/src/compiler/analyses/user_components/annotations/mod.rs +++ b/compiler/pavexc/src/compiler/analyses/user_components/annotations/mod.rs @@ -36,7 +36,8 @@ use crate::{ ResolvedType, }, rustdoc::{ - AnnotationCoordinates, Crate, CrateCollection, GlobalItemId, ImplInfo, RustdocKindExt, + AnnotationCoordinates, Crate, CrateCollection, ExternalReExportsExt, GlobalItemId, + ImplInfo, RustdocKindExt, }, }; use pavex_bp_schema::{CloningPolicy, Lifecycle, Lint, LintSetting}; diff --git a/compiler/pavexc/src/rustdoc/annotations/diagnostic.rs b/compiler/pavexc/src/rustdoc/annotations/diagnostic.rs index 42dcb6106..4ff3a8a25 100644 --- a/compiler/pavexc/src/rustdoc/annotations/diagnostic.rs +++ b/compiler/pavexc/src/rustdoc/annotations/diagnostic.rs @@ -6,7 +6,7 @@ use pavex_cli_diagnostic::{AnnotatedSource, CompilerDiagnostic, HelpWithSnippet} use pavexc_attr_parser::{AnnotationKind, errors::AttributeParserError}; use rustdoc_types::Item; -use super::items::IdConflict; +use pavexc_rustdoc_cache::IdConflict; pub(crate) fn invalid_diagnostic_attribute( e: AttributeParserError, diff --git a/compiler/pavexc/src/rustdoc/annotations/mod.rs b/compiler/pavexc/src/rustdoc/annotations/mod.rs index 
71c65a9ad..3bc1bcf24 100644 --- a/compiler/pavexc/src/rustdoc/annotations/mod.rs +++ b/compiler/pavexc/src/rustdoc/annotations/mod.rs @@ -1,5 +1,4 @@ mod diagnostic; -mod items; mod parser; mod queue; @@ -12,7 +11,7 @@ use rustdoc_types::{Enum, ItemEnum, Struct, Trait}; use std::collections::BTreeSet; pub(crate) use diagnostic::invalid_diagnostic_attribute; -pub use items::{AnnotatedItem, AnnotatedItems, ImplInfo}; +pub use pavexc_rustdoc_cache::{AnnotatedItem, AnnotatedItems, ImplInfo}; pub(crate) use parser::parse_pavex_attributes; pub(crate) use queue::QueueItem; diff --git a/compiler/pavexc/src/rustdoc/compute/cache.rs b/compiler/pavexc/src/rustdoc/compute/cache.rs index 0aeb9b594..78fcfdf9a 100644 --- a/compiler/pavexc/src/rustdoc/compute/cache.rs +++ b/compiler/pavexc/src/rustdoc/compute/cache.rs @@ -1,62 +1,119 @@ -use std::{borrow::Cow, collections::BTreeSet}; +//! Thin wrapper around `pavexc_rustdoc_cache` that integrates with pavexc's types. -use anyhow::Context; -use camino::Utf8Path; -use guppy::{ - PackageId, - graph::{PackageGraph, PackageMetadata, feature::StandardFeatures}, -}; -use itertools::Itertools; -use r2d2_sqlite::SqliteConnectionManager; +use std::borrow::Cow; + +use guppy::PackageId; +use guppy::graph::PackageGraph; +use rkyv::rancor::Panic; use rkyv::util::AlignedVec; -use rusqlite::{ToSql, params, types::ToSqlOutput}; -use tracing::instrument; -use tracing_log_error::log_error; -use crate::{ - DiagnosticSink, - rustdoc::{ - annotations::AnnotatedItems, - queries::{ - CrateData, CrateItemIndex, CrateItemPaths, ImportPath2Id, LazyCrateItemIndex, - LazyCrateItemPaths, LazyImportPath2Id, - }, - }, +pub use pavexc_rustdoc_cache::{ + CacheEntry, EagerCrateItemIndex, EagerCrateItemPaths, EagerImportPath2Id, RkyvCowBytes, + RustdocCacheKey, RustdocGlobalFsCache, SecondaryIndexes, }; +use pavexc_rustdoc_cache::HydratedCacheEntry as CacheEntryInner; -use super::{checksum::checksum_crate, rustdoc_options}; +use crate::DiagnosticSink; +use 
crate::rustdoc::queries::CrateCore; -/// A cache for storing and retrieving pre-computed JSON documentation generated by `rustdoc`. -/// -/// The cache is shared across all Pavex projects of the current user. -/// It is stored on disk, in the user home directory, using a SQLite database. -#[derive(Debug, Clone)] -pub(crate) struct RustdocGlobalFsCache { - cargo_fingerprint: String, - third_party_cache: ThirdPartyCrateCache, - toolchain_cache: ToolchainCache, - connection_pool: r2d2::Pool<SqliteConnectionManager>, +/// Extension trait to create `CacheEntry` from `&Crate`. +pub trait CacheEntryExt<'a> { + /// Create a cache entry from a crate, including secondary indexes. + fn from_crate(krate: &'a crate::rustdoc::Crate) -> Result<CacheEntry<'a>, anyhow::Error>; + /// Create a raw cache entry from a crate (no secondary indexes). + fn from_crate_raw(krate: &'a crate::rustdoc::Crate) -> Result<CacheEntry<'a>, anyhow::Error>; } -pub(crate) enum RustdocCacheKey<'a> { - ThirdPartyCrate(PackageMetadata<'a>), - ToolchainCrate(&'a str), -} +impl<'a> CacheEntryExt<'a> for CacheEntry<'a> { + fn from_crate(krate: &'a crate::rustdoc::Crate) -> Result<CacheEntry<'a>, anyhow::Error> { + // Serialize the crate data + let external_crates = bincode::serde::encode_to_vec( + &krate.core.krate.external_crates, + bincode::config::standard(), + )?; -impl std::fmt::Debug for RustdocCacheKey<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - RustdocCacheKey::ThirdPartyCrate(metadata) => f - .debug_struct("ThirdPartyCrate") - .field("id", &metadata.id()) - .field("name", &metadata.name()) - .field("version", &metadata.version()) - .finish(), - RustdocCacheKey::ToolchainCrate(name) => f - .debug_struct("ToolchainCrate") - .field("name", name) - .finish(), - } + // Serialize paths - handle Eager variant + let paths: AlignedVec = match &krate.core.krate.paths { + pavexc_rustdoc_cache::CrateItemPaths::Eager(EagerCrateItemPaths { paths }) => { + rkyv::to_bytes::<Panic>(paths)? 
+ } + pavexc_rustdoc_cache::CrateItemPaths::Lazy(lazy) => lazy.bytes.clone(), + }; + + // Serialize items - handle Eager variant + let items: AlignedVec = match &krate.core.krate.index { + pavexc_rustdoc_cache::CrateItemIndex::Eager(EagerCrateItemIndex { index }) => { + rkyv::to_bytes::<Panic>(index)? + } + pavexc_rustdoc_cache::CrateItemIndex::Lazy(lazy) => lazy.bytes.clone(), + }; + + // Serialize import_path2id + let import_path2id: AlignedVec = match &krate.import_path2id { + pavexc_rustdoc_cache::ImportPath2Id::Eager(EagerImportPath2Id(m)) => { + rkyv::to_bytes::<Panic>(m)? + } + pavexc_rustdoc_cache::ImportPath2Id::Lazy(lazy) => lazy.0.clone(), + }; + + // Serialize other secondary indexes + let import_index = + bincode::serde::encode_to_vec(&krate.import_index, bincode::config::standard())?; + let annotated_items = + bincode::serde::encode_to_vec(&krate.annotated_items, bincode::config::standard())?; + let re_exports = bincode::serde::encode_to_vec( + &krate.external_re_exports, + bincode::config::standard(), + )?; + + let secondary_indexes = SecondaryIndexes { + import_index: Cow::Owned(import_index), + annotated_items: Some(Cow::Owned(annotated_items)), + import_path2id: RkyvCowBytes::Owned(import_path2id), + re_exports: Cow::Owned(re_exports), + }; + + Ok(CacheEntry { + root_item_id: krate.core.krate.root_item_id.0, + external_crates: Cow::Owned(external_crates), + paths: RkyvCowBytes::Owned(paths), + format_version: krate.core.krate.format_version as i64, + items: RkyvCowBytes::Owned(items), + secondary_indexes: Some(secondary_indexes), + }) + } + + fn from_crate_raw(krate: &'a crate::rustdoc::Crate) -> Result<CacheEntry<'a>, anyhow::Error> { + // Serialize the crate data + let external_crates = bincode::serde::encode_to_vec( + &krate.core.krate.external_crates, + bincode::config::standard(), + )?; + + // Serialize paths - handle Eager variant + let paths: AlignedVec = match &krate.core.krate.paths { + pavexc_rustdoc_cache::CrateItemPaths::Eager(EagerCrateItemPaths { paths }) => { + 
rkyv::to_bytes::(paths)? + } + pavexc_rustdoc_cache::CrateItemPaths::Lazy(lazy) => lazy.bytes.clone(), + }; + + // Serialize items - handle Eager variant + let items: AlignedVec = match &krate.core.krate.index { + pavexc_rustdoc_cache::CrateItemIndex::Eager(EagerCrateItemIndex { index }) => { + rkyv::to_bytes::(index)? + } + pavexc_rustdoc_cache::CrateItemIndex::Lazy(lazy) => lazy.bytes.clone(), + }; + + Ok(CacheEntry { + root_item_id: krate.core.krate.root_item_id.0, + external_crates: Cow::Owned(external_crates), + paths: RkyvCowBytes::Owned(paths), + format_version: krate.core.krate.format_version as i64, + items: RkyvCowBytes::Owned(items), + secondary_indexes: None, + }) } } @@ -67,64 +124,107 @@ pub(crate) enum RustdocCacheEntry { /// This happens when the indexing phase emitted one or more diagnostics, /// thus forcing to go through that step (and report those errors) /// every single time we attempt a compilation. - Raw(CrateData), + Raw(CacheEntryInner), /// The cache holds both the raw `rustdoc` output and our secondary indexes. /// It's ready to be used as is! Processed(crate::rustdoc::Crate), } impl RustdocCacheEntry { + /// Convert a cache entry from the rustdoc_cache crate to our internal representation. 
+ pub fn from_cache_inner(inner: CacheEntryInner) -> Self { + match inner { + CacheEntryInner::Raw(crate_data) => RustdocCacheEntry::Raw(CacheEntryInner::Raw(crate_data)), + CacheEntryInner::Processed(processed) => { + let krate = crate::rustdoc::Crate { + core: CrateCore { + package_id: processed.package_id, + krate: processed.crate_data, + }, + import_path2id: processed.import_path2id, + import_index: processed.import_index, + external_re_exports: processed.external_re_exports, + annotated_items: processed.annotated_items, + crate_id2package_id: Default::default(), + }; + RustdocCacheEntry::Processed(krate) + } + } + } + pub fn process(self, package_id: PackageId, sink: &DiagnosticSink) -> crate::rustdoc::Crate { match self { - RustdocCacheEntry::Raw(crate_data) => { - crate::rustdoc::Crate::index(crate_data, package_id, sink) + RustdocCacheEntry::Raw(inner) => { + match inner { + CacheEntryInner::Raw(crate_data) => { + crate::rustdoc::Crate::index(crate_data, package_id, sink) + } + CacheEntryInner::Processed(processed) => { + // This shouldn't happen since we check above, but handle it gracefully + crate::rustdoc::Crate { + core: CrateCore { + package_id: processed.package_id, + krate: processed.crate_data, + }, + import_path2id: processed.import_path2id, + import_index: processed.import_index, + external_re_exports: processed.external_re_exports, + annotated_items: processed.annotated_items, + crate_id2package_id: Default::default(), + } + } + } } RustdocCacheEntry::Processed(c) => c, } } } -static BINCODE_CONFIG: bincode::config::Configuration = bincode::config::standard(); +/// Wrapper around [`RustdocGlobalFsCache`] that integrates with pavexc's caching fingerprint. 
+pub(crate) struct PavexRustdocCache { + inner: RustdocGlobalFsCache, +} -impl<'a> RustdocCacheKey<'a> { - pub fn new(package_id: &'a PackageId, package_graph: &'a PackageGraph) -> RustdocCacheKey<'a> { - if crate::rustdoc::TOOLCHAIN_CRATES.contains(&package_id.repr()) { - RustdocCacheKey::ToolchainCrate(package_id.repr()) - } else { - RustdocCacheKey::ThirdPartyCrate(package_graph.metadata(package_id).unwrap()) +impl std::fmt::Debug for PavexRustdocCache { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PavexRustdocCache") + .field("inner", &self.inner) + .finish() + } +} + +impl Clone for PavexRustdocCache { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), } } } -impl RustdocGlobalFsCache { +impl PavexRustdocCache { + /// Construct the cache fingerprint for pavexc. + fn cache_fingerprint() -> String { + format!( + "{}-{}", + pavexc_rustdoc_cache::CRATE_VERSION, + env!("RUSTDOC_CACHE_SOURCE_HASH") + ) + } + /// Initialize a new instance of the cache. 
- #[tracing::instrument(name = "Initialize on-disk rustdoc cache", skip_all)] pub(crate) fn new( toolchain_name: &str, cache_workspace_package_docs: bool, package_graph: &PackageGraph, ) -> Result<Self, anyhow::Error> { - std::thread::scope(|scope| { - let handle = scope.spawn(|| cargo_fingerprint(toolchain_name)); - - let pool = Self::setup_database()?; - let connection = pool.get()?; - let third_party_cache = ThirdPartyCrateCache::new( - &connection, - cache_workspace_package_docs, - package_graph, - )?; - let toolchain_cache = ToolchainCache::new(&connection)?; - let cargo_fingerprint = handle - .join() - .expect("Failed to compute on `cargo`'s fingerprint")?; - Ok(Self { - cargo_fingerprint, - connection_pool: pool, - third_party_cache, - toolchain_cache, - }) - }) + let fingerprint = Self::cache_fingerprint(); + let inner = RustdocGlobalFsCache::new( + &fingerprint, + toolchain_name, + cache_workspace_package_docs, + package_graph, + )?; + Ok(Self { inner }) } /// Retrieve the cached documentation for a given package, if available. @@ -133,973 +233,55 @@ impl RustdocGlobalFsCache { cache_key: &RustdocCacheKey, package_graph: &PackageGraph, ) -> Result<Option<RustdocCacheEntry>, anyhow::Error> { - let connection = self.connection_pool.get()?; - match cache_key { - RustdocCacheKey::ThirdPartyCrate(metadata) => self.third_party_cache.get( - metadata, - &self.cargo_fingerprint, - &connection, - package_graph, - ), - RustdocCacheKey::ToolchainCrate(name) => { - self.toolchain_cache - .get(name, &self.cargo_fingerprint, &connection) - } + match self.inner.get(cache_key, package_graph)? { + Some(entry) => Ok(Some(RustdocCacheEntry::from_cache_inner(entry))), + None => Ok(None), } } - /// Convert the JSON documentation generated by `rustdoc` into the format used by our cache, - /// then store it. - pub(crate) fn convert_and_insert( + /// Store the JSON documentation for a crate in the cache. 
+ pub(crate) fn insert( &self, cache_key: &RustdocCacheKey, - krate: &crate::rustdoc::Crate, - cache_indexes: bool, + cache_entry: CacheEntry, package_graph: &PackageGraph, ) -> Result<(), anyhow::Error> { - let connection = self.connection_pool.get()?; - match cache_key { - RustdocCacheKey::ThirdPartyCrate(metadata) => { - self.third_party_cache.convert_and_insert( - metadata, - krate, - &self.cargo_fingerprint, - &connection, - cache_indexes, - package_graph, - ) - } - RustdocCacheKey::ToolchainCrate(name) => self.toolchain_cache.convert_and_insert( - name, - krate, - &self.cargo_fingerprint, - &connection, - ), - } + self.inner.insert(cache_key, cache_entry, package_graph) } - /// Store the JSON documentation for a crate, which has already been converted to the expected - /// format for caching. - /// - /// This method should be preferred to [`Self::insert`] whenever multi-threading is involved, - /// since [`RustdocGlobalFsCache`] isn't thread-safe, but the conversion into the caching format - /// doesn't require its internals and can therefore be offloaded to another thread without issues. - pub(crate) fn insert( + /// Convert the JSON documentation generated by `rustdoc` into the format used by our cache, + /// then store it. 
+ pub(crate) fn convert_and_insert( &self, cache_key: &RustdocCacheKey, - cache_entry: CacheEntry, + krate: &crate::rustdoc::Crate, + cache_indexes: bool, package_graph: &PackageGraph, ) -> Result<(), anyhow::Error> { - let connection = self.connection_pool.get()?; - match cache_key { - RustdocCacheKey::ThirdPartyCrate(metadata) => { - let Some(cache_key) = self.third_party_cache.cache_key( - metadata, - &self.cargo_fingerprint, - package_graph, - ) else { - return Ok(()); - }; - self.third_party_cache - .insert(cache_key, &connection, cache_entry) - } - RustdocCacheKey::ToolchainCrate(name) => { - self.toolchain_cache - .insert(name, cache_entry, &self.cargo_fingerprint, &connection) - } - } + let cache_entry = if cache_indexes { + <CacheEntry as CacheEntryExt>::from_crate(krate) + } else { + <CacheEntry as CacheEntryExt>::from_crate_raw(krate) + }?; + self.insert(cache_key, cache_entry, package_graph) } - #[tracing::instrument(skip_all, level = "trace")] /// Persist the list of package IDs that were accessed during the processing of the /// application blueprint for this project. pub(crate) fn persist_access_log( &self, - package_ids: &BTreeSet<PackageId>, + package_ids: &std::collections::BTreeSet<PackageId>, project_fingerprint: &str, ) -> Result<(), anyhow::Error> { - let connection = self.connection_pool.get()?; - - let mut stmt = connection.prepare_cached( - "INSERT INTO project2package_id_access_log ( - project_fingerprint, - package_ids - ) VALUES (?, ?) - ON CONFLICT(project_fingerprint) DO UPDATE SET package_ids=excluded.package_ids; - ", - )?; - stmt.execute(params![ - project_fingerprint, - bincode::encode_to_vec( - package_ids.iter().map(|s| s.repr()).collect_vec(), - BINCODE_CONFIG - )? - ])?; - - Ok(()) + self.inner.persist_access_log(package_ids, project_fingerprint) } - #[tracing::instrument(skip_all, level = "trace")] /// Retrieve the list of package IDs that were accessed during the last time we processed the /// application blueprint for this project. 
- /// - /// Returns an empty set if no access log is found for the given project fingerprint. pub(crate) fn get_access_log( &self, project_fingerprint: &str, - ) -> Result, anyhow::Error> { - let connection = self.connection_pool.get()?; - - let mut stmt = connection.prepare_cached( - "SELECT package_ids FROM project2package_id_access_log WHERE project_fingerprint = ?", - )?; - let mut rows = stmt.query(params![project_fingerprint])?; - let Some(row) = rows.next()? else { - return Ok(BTreeSet::new()); - }; - - let package_ids: Vec<&str> = - bincode::borrow_decode_from_slice(row.get_ref_unwrap(0).as_bytes()?, BINCODE_CONFIG)?.0; - Ok(package_ids.into_iter().map(PackageId::new).collect()) - } - - /// Initialize the database, creating the file and the relevant tables if they don't exist yet. - fn setup_database() -> Result, anyhow::Error> { - let pavex_fingerprint = - concat!(env!("CARGO_PKG_VERSION"), '-', env!("VERGEN_GIT_DESCRIBE")); - let cache_dir = xdg_home::home_dir() - .ok_or_else(|| anyhow::anyhow!("Failed to get the user's home directory"))? - .join(".pavex/rustdoc/cache"); - fs_err::create_dir_all(&cache_dir).with_context(|| { - format!( - "Failed to create the cache directory at {}", - cache_dir.to_string_lossy() - ) - })?; - - // For the sake of simplicity, we use a different SQLite database for each version of Pavex. - // This ensures that we don't have to worry about schema migrations. - // The cost we pay: the user will have to re-generate the documentation for all their crates - // when they upgrade Pavex. - // We can improve this in the future, if needed. - let cache_path = cache_dir.join(format!("{pavex_fingerprint}.db")); - - #[derive(Debug)] - struct SqlitePragmas; - - impl r2d2::CustomizeConnection for SqlitePragmas { - fn on_acquire(&self, conn: &mut rusqlite::Connection) -> Result<(), rusqlite::Error> { - conn.execute_batch( - // 250MB memory-mapped, more than enough. 
- "PRAGMA mmap_size=262144000;", - )?; - Ok(()) - } - } - - let manager = SqliteConnectionManager::file(cache_path); - let pool = r2d2::Pool::builder() - .max_size(num_cpus::get() as u32) - .connection_customizer(Box::new(SqlitePragmas)) - .build(manager) - .context("Failed to open/create a SQLite database to store the contents of pavex's rustdoc cache")?; - - let connection = pool.get()?; - connection.execute_batch( - "PRAGMA journal_mode=WAL; - PRAGMA synchronous=NORMAL;", - )?; - connection.execute( - "CREATE TABLE IF NOT EXISTS project2package_id_access_log ( - project_fingerprint TEXT NOT NULL, - package_ids BLOB NOT NULL, - PRIMARY KEY (project_fingerprint) - )", - [], - )?; - - Ok(pool) - } -} - -#[derive(Debug, Clone)] -#[non_exhaustive] -struct ToolchainCache {} - -impl ToolchainCache { - fn new(connection: &rusqlite::Connection) -> Result { - Self::setup_table(connection)?; - Ok(Self {}) - } - - /// Retrieve the cached documentation for a given toolchain crate, if available. - #[instrument(name = "Retrieve cached toolchain docs from disk", - skip_all, - level=tracing::Level::DEBUG, - fields(crate.name = %name) - )] - fn get( - &self, - name: &str, - cargo_fingerprint: &str, - connection: &rusqlite::Connection, - ) -> Result, anyhow::Error> { - // Retrieve from rustdoc's output from cache, if available. - let mut stmt = connection.prepare_cached( - "SELECT - root_item_id, - external_crates, - paths, - format_version, - items, - import_index, - import_path2id, - re_exports - FROM rustdoc_toolchain_crates_cache - WHERE name = ? AND cargo_fingerprint = ?", - )?; - - let span = tracing::trace_span!("Execute query"); - let guard = span.enter(); - let mut rows = stmt.query(params![name, cargo_fingerprint])?; - let Some(row) = rows.next()? 
else { - return Ok(None); - }; - drop(guard); - - let root_item_id = row.get_ref_unwrap(0).as_i64()?.try_into()?; - let external_crates = row.get_ref_unwrap(1).as_bytes()?; - let paths = row.get_ref_unwrap(2).as_bytes()?; - let format_version = row.get_ref_unwrap(3).as_i64()?; - - let items = row.get_ref_unwrap(4).as_bytes()?; - - let import_index = row.get_ref_unwrap(5).as_bytes()?; - let import_path2id = row.get_ref_unwrap(6).as_bytes()?; - let re_exports = row.get_ref_unwrap(7).as_bytes()?; - - let krate = CacheEntry { - root_item_id, - external_crates: Cow::Borrowed(external_crates), - paths: RkyvCowBytes::Borrowed(paths), - format_version, - items: RkyvCowBytes::Borrowed(items), - secondary_indexes: Some(SecondaryIndexes { - import_index: Cow::Borrowed(import_index), - // Standard library crates don't have Pavex annotations. - annotated_items: None, - import_path2id: RkyvCowBytes::Borrowed(import_path2id), - re_exports: Cow::Borrowed(re_exports), - }), - } - .hydrate(PackageId::new(name))?; - - Ok(Some(krate)) - } - - /// Store the JSON documentation for a toolchain crate in the cache. - #[instrument(name = "Cache rustdoc output on disk", skip_all, level=tracing::Level::DEBUG, fields(crate.name = name))] - fn convert_and_insert( - &self, - name: &str, - krate: &crate::rustdoc::Crate, - cargo_fingerprint: &str, - connection: &rusqlite::Connection, - ) -> Result<(), anyhow::Error> { - let cache_entry = CacheEntry::new(krate).context("Failed to serialize docs")?; - self.insert(name, cache_entry, cargo_fingerprint, connection) - } - - /// Store the JSON documentation for a toolchain crate in the cache. 
- #[instrument(name = "Cache rustdoc output on disk", skip_all, level=tracing::Level::DEBUG, fields(crate.name = name))] - fn insert( - &self, - name: &str, - cache_entry: CacheEntry<'_>, - cargo_fingerprint: &str, - connection: &rusqlite::Connection, - ) -> Result<(), anyhow::Error> { - let mut stmt = connection.prepare_cached( - "INSERT INTO rustdoc_toolchain_crates_cache ( - name, - cargo_fingerprint, - root_item_id, - external_crates, - paths, - format_version, - items, - import_index, - import_path2id, - re_exports - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", - )?; - stmt.execute(params![ - name, - cargo_fingerprint, - cache_entry.root_item_id, - cache_entry.external_crates, - cache_entry.paths, - cache_entry.format_version, - cache_entry.items, - cache_entry - .secondary_indexes - .as_ref() - .expect("Indexing never fails for toolchain crates") - .import_index, - cache_entry - .secondary_indexes - .as_ref() - .expect("Indexing never fails for toolchain crates") - .import_path2id, - cache_entry - .secondary_indexes - .as_ref() - .expect("Indexing never fails for toolchain crates") - .re_exports - ])?; - Ok(()) - } - - fn setup_table(connection: &rusqlite::Connection) -> Result<(), anyhow::Error> { - connection.execute( - "CREATE TABLE IF NOT EXISTS rustdoc_toolchain_crates_cache ( - name TEXT NOT NULL, - cargo_fingerprint TEXT NOT NULL, - root_item_id INTEGER NOT NULL, - external_crates BLOB NOT NULL, - paths BLOB NOT NULL, - format_version INTEGER NOT NULL, - items BLOB NOT NULL, - import_index BLOB NOT NULL, - import_path2id BLOB NOT NULL, - re_exports BLOB NOT NULL, - PRIMARY KEY (name, cargo_fingerprint) - )", - [], - )?; - Ok(()) - } -} - -#[derive(Debug, Clone)] -#[non_exhaustive] -struct ThirdPartyCrateCache { - cache_workspace_packages: bool, -} - -impl ThirdPartyCrateCache { - fn new( - connection: &rusqlite::Connection, - cache_workspace_packages: bool, - package_graph: &PackageGraph, - ) -> Result { - Self::setup_table(connection)?; - // Force the 
creation of the feature graph ahead of our queries. - // It'll be cached internally by the `package_graph`. - let _ = package_graph.feature_graph(); - Ok(Self { - cache_workspace_packages, - }) - } - - /// Retrieve the cached documentation for a given package, if available. - #[instrument(name = "Retrieve third-party crate docs from disk cache", - skip_all, - level=tracing::Level::DEBUG, - fields(crate.id = %package_metadata.id(), cache_key = tracing::field::Empty, hit = tracing::field::Empty) - )] - fn get( - &self, - package_metadata: &PackageMetadata, - cargo_fingerprint: &str, - connection: &rusqlite::Connection, - package_graph: &PackageGraph, - ) -> Result, anyhow::Error> { - fn _get( - package_metadata: &PackageMetadata, - cargo_fingerprint: &str, - connection: &rusqlite::Connection, - cache_workspace_packages: bool, - package_graph: &PackageGraph, - ) -> Result, anyhow::Error> { - let Some(cache_key) = ThirdPartyCrateCacheKey::build( - package_graph, - package_metadata, - cargo_fingerprint, - cache_workspace_packages, - ) else { - return Ok(None); - }; - tracing::Span::current().record("cache_key", tracing::field::debug(&cache_key)); - // Retrieve from rustdoc's output from cache, if available. - let mut stmt = connection.prepare_cached( - "SELECT - root_item_id, - external_crates, - paths, - format_version, - items, - import_index, - import_path2id, - re_exports, - annotated_items - FROM rustdoc_3d_party_crates_cache - WHERE crate_name = ? AND - crate_source = ? AND - crate_version = ? AND - crate_hash = ? AND - cargo_fingerprint = ? AND - rustdoc_options = ? AND - default_feature_is_enabled = ? AND - active_named_features = ?", - )?; - let span = tracing::trace_span!("Execute query"); - let guard = span.enter(); - let mut rows = stmt.query(params![ - cache_key.crate_name, - cache_key.crate_source, - cache_key.crate_version, - // `NULL` values are considered to be distinct from all other values - // by SQLite, including other `NULL`s. 
Therefore we use an empty - // string as a placeholder for `NULL` values. - cache_key.crate_hash.unwrap_or_default(), - cache_key.cargo_fingerprint, - cache_key.rustdoc_options, - cache_key.default_feature_is_enabled, - cache_key.active_named_features - ])?; - let Some(row) = rows.next().context("Failed to fetch next row")? else { - return Ok(None); - }; - drop(guard); - - let root_item_id = row.get_ref_unwrap(0).as_i64()?.try_into()?; - let external_crates = row.get_ref_unwrap(1).as_bytes()?; - let paths = row.get_ref_unwrap(2).as_bytes()?; - let format_version = row.get_ref_unwrap(3).as_i64()?; - let items = row.get_ref_unwrap(4).as_bytes()?; - let import_index = row.get_ref_unwrap(5).as_bytes_or_null()?; - let import_path2id = row.get_ref_unwrap(6).as_bytes_or_null()?; - let re_exports = row.get_ref_unwrap(7).as_bytes_or_null()?; - let annotated_items = row.get_ref_unwrap(8).as_bytes_or_null()?; - - let secondary_indexes = - match (import_index, import_path2id, re_exports, annotated_items) { - ( - Some(import_index), - Some(import_path2id), - Some(re_exports), - Some(annotated_items), - ) => Some(SecondaryIndexes { - import_index: Cow::Borrowed(import_index), - import_path2id: RkyvCowBytes::Borrowed(import_path2id), - re_exports: Cow::Borrowed(re_exports), - annotated_items: Some(Cow::Borrowed(annotated_items)), - }), - _ => None, - }; - - let krate = CacheEntry { - root_item_id, - external_crates: Cow::Borrowed(external_crates), - paths: RkyvCowBytes::Borrowed(paths), - format_version, - items: RkyvCowBytes::Borrowed(items), - secondary_indexes, - } - .hydrate(package_metadata.id().to_owned()) - .context("Failed to re-hydrate the stored docs")?; - - Ok(Some(krate)) - } - let outcome = _get( - package_metadata, - cargo_fingerprint, - connection, - self.cache_workspace_packages, - package_graph, - ); - match &outcome { - Ok(Some(_)) => { - tracing::Span::current().record("hit", true); - } - Ok(None) => { - tracing::Span::current().record("hit", false); - } - _ => 
{} - } - outcome - } - - /// Compute the cache key for a given package. - fn cache_key<'a>( - &self, - package_metadata: &'a PackageMetadata, - cargo_fingerprint: &'a str, - package_graph: &PackageGraph, - ) -> Option> { - ThirdPartyCrateCacheKey::build( - package_graph, - package_metadata, - cargo_fingerprint, - self.cache_workspace_packages, - ) - } - - /// Convert the JSON documentation generated by `rustdoc` to the format used by our cache, - /// then store it. - #[instrument( - name = "Convert and cache docs for a third-party crate to disk", - skip_all, - level=tracing::Level::DEBUG, - fields(crate.id = %package_metadata.id(), cache_key = tracing::field::Empty)) - ] - fn convert_and_insert( - &self, - package_metadata: &PackageMetadata, - krate: &crate::rustdoc::Crate, - cargo_fingerprint: &str, - connection: &rusqlite::Connection, - cache_indexes: bool, - package_graph: &PackageGraph, - ) -> Result<(), anyhow::Error> { - let Some(cache_key) = ThirdPartyCrateCacheKey::build( - package_graph, - package_metadata, - cargo_fingerprint, - self.cache_workspace_packages, - ) else { - return Ok(()); - }; - tracing::Span::current().record("cache_key", tracing::field::debug(&cache_key)); - let cached_data = if cache_indexes { - CacheEntry::new(krate) - } else { - CacheEntry::raw(krate) - } - .context("Failed to serialize docs")?; - self.insert(cache_key, connection, cached_data) - } - - /// Store the JSON documentation generated by `rustdoc` in the cache, - /// without having to perform the conversion towards the caching format. 
- #[instrument( - name = "Stored cache data for third-party crate docs to disk", - skip_all, - level=tracing::Level::DEBUG, - fields(cache_key = tracing::field::Empty)) - ] - fn insert( - &self, - cache_key: ThirdPartyCrateCacheKey<'_>, - connection: &rusqlite::Connection, - cached_data: CacheEntry<'_>, - ) -> Result<(), anyhow::Error> { - tracing::Span::current().record("cache_key", tracing::field::debug(&cache_key)); - let mut stmt = connection.prepare_cached( - "INSERT INTO rustdoc_3d_party_crates_cache ( - crate_name, - crate_source, - crate_version, - crate_hash, - cargo_fingerprint, - rustdoc_options, - default_feature_is_enabled, - active_named_features, - root_item_id, - external_crates, - paths, - format_version, - items, - import_index, - import_path2id, - re_exports, - annotated_items - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", - )?; - stmt.execute(params![ - cache_key.crate_name, - cache_key.crate_source, - cache_key.crate_version, - // `NULL` values are considered to be distinct from all other values - // by SQLite, including other `NULL`s. Therefore we use an empty - // string as a placeholder for `NULL` values. 
- cache_key.crate_hash.unwrap_or_default(), - cache_key.cargo_fingerprint, - cache_key.rustdoc_options, - cache_key.default_feature_is_enabled, - cache_key.active_named_features, - cached_data.root_item_id, - cached_data.external_crates, - cached_data.paths, - cached_data.format_version, - cached_data.items, - cached_data - .secondary_indexes - .as_ref() - .map(|i| i.import_index.as_ref()), - cached_data - .secondary_indexes - .as_ref() - .map(|indexes| indexes.import_path2id.as_ref()), - cached_data - .secondary_indexes - .as_ref() - .map(|indexes| indexes.re_exports.as_ref()), - cached_data - .secondary_indexes - .as_ref() - .map(|indexes| indexes.annotated_items.as_ref()) - ])?; - Ok(()) - } - - fn setup_table(connection: &rusqlite::Connection) -> Result<(), anyhow::Error> { - connection.execute( - "CREATE TABLE IF NOT EXISTS rustdoc_3d_party_crates_cache ( - crate_name TEXT NOT NULL, - crate_source TEXT NOT NULL, - crate_version TEXT NOT NULL, - crate_hash TEXT NOT NULL, - cargo_fingerprint TEXT NOT NULL, - rustdoc_options TEXT NOT NULL, - default_feature_is_enabled INTEGER NOT NULL, - active_named_features TEXT NOT NULL, - root_item_id INTEGER NOT NULL, - external_crates BLOB NOT NULL, - paths BLOB NOT NULL, - format_version INTEGER NOT NULL, - items BLOB NOT NULL, - annotated_items BLOB, - import_index BLOB, - import_path2id BLOB, - re_exports BLOB, - PRIMARY KEY (crate_name, crate_source, crate_version, crate_hash, cargo_fingerprint, rustdoc_options, default_feature_is_enabled, active_named_features) - )", - [] - )?; - Ok(()) - } -} - -#[derive(Debug)] -/// The serialized form of a crate's documentation, as stored in the cache. -pub(in crate::rustdoc) struct CacheEntry<'a> { - root_item_id: u32, - external_crates: Cow<'a, [u8]>, - paths: RkyvCowBytes<'a>, - format_version: i64, - items: RkyvCowBytes<'a>, - secondary_indexes: Option>, -} - -#[derive(Debug)] -/// A `Cow` variant to work with `rkyv`'s `AlignedVec`. 
-pub(in crate::rustdoc) enum RkyvCowBytes<'a> { - Borrowed(&'a [u8]), - Owned(AlignedVec), -} - -impl ToSql for RkyvCowBytes<'_> { - fn to_sql(&self) -> rusqlite::Result> { - let s = match self { - RkyvCowBytes::Borrowed(items) => items, - RkyvCowBytes::Owned(s) => s.as_slice(), - }; - Ok(ToSqlOutput::Borrowed(rusqlite::types::ValueRef::Blob(s))) - } -} - -impl<'a> RkyvCowBytes<'a> { - pub fn into_owned(self) -> AlignedVec { - match self { - RkyvCowBytes::Borrowed(items) => { - let mut v = AlignedVec::with_capacity(items.len()); - v.extend_from_slice(items); - v - } - RkyvCowBytes::Owned(aligned_vec) => aligned_vec, - } - } -} - -impl<'a> AsRef<[u8]> for RkyvCowBytes<'a> { - fn as_ref(&self) -> &[u8] { - match self { - RkyvCowBytes::Borrowed(items) => items, - RkyvCowBytes::Owned(aligned_vec) => aligned_vec.as_slice(), - } - } -} - -#[derive(Debug)] -/// Data that can be computed starting from the raw JSON documentation for a crate, -/// without having to re-invoke `rustdoc`. -pub(in crate::rustdoc) struct SecondaryIndexes<'a> { - import_index: Cow<'a, [u8]>, - annotated_items: Option>, - import_path2id: RkyvCowBytes<'a>, - re_exports: Cow<'a, [u8]>, -} - -impl<'a> CacheEntry<'a> { - pub fn new(krate: &'a crate::rustdoc::Crate) -> Result, anyhow::Error> { - let mut cached = Self::raw(krate)?; - let import_index = bincode::serde::encode_to_vec(&krate.import_index, BINCODE_CONFIG)?; - let annotated_items = - bincode::serde::encode_to_vec(&krate.annotated_items, BINCODE_CONFIG)?; - let re_exports = bincode::serde::encode_to_vec(&krate.external_re_exports, BINCODE_CONFIG)?; - - // Serialize the items HashMap using rkyv for zero-copy deserialization later. - let ImportPath2Id::Eager(import_path2id) = &krate.import_path2id else { - anyhow::bail!( - "The crate's import path<>id map is not deserialized. Are we trying to cache \ - the same crate twice? This is a bug." 
- ); - }; - let import_path2id = - rkyv::to_bytes::(&import_path2id.0).map_err(|e| { - anyhow::anyhow!(e).context("Failed to serialize import path<>id map with rkyv") - })?; - - cached.secondary_indexes = Some(SecondaryIndexes { - import_index: Cow::Owned(import_index), - annotated_items: Some(Cow::Owned(annotated_items)), - import_path2id: RkyvCowBytes::Owned(import_path2id), - re_exports: Cow::Owned(re_exports), - }); - Ok(cached) - } - - /// Cache only `rustdoc`'s output, no secondary indexes. - pub fn raw(krate: &'a crate::rustdoc::Crate) -> Result, anyhow::Error> { - let crate_data = &krate.core.krate; - let CrateItemIndex::Eager(index) = &crate_data.index else { - anyhow::bail!( - "The crate's item index is not deserialized. Are we trying to cache \ - the same crate twice? This is a bug." - ); - }; - let CrateItemPaths::Eager(paths) = &crate_data.paths else { - anyhow::bail!( - "The crate item paths is not deserialized. Are we trying to cache \ - the same crate twice? This is a bug." - ); - }; - - let items = rkyv::to_bytes::(&index.index) - .map_err(|e| anyhow::anyhow!(e).context("Failed to serialize crate items with rkyv"))?; - - let external_crates = - bincode::serde::encode_to_vec(&crate_data.external_crates, BINCODE_CONFIG)?; - let paths = rkyv::to_bytes::(&paths.paths).map_err(|e| { - anyhow::anyhow!(e).context("Failed to serialize item summaries with rkyv") - })?; - - Ok(CacheEntry { - root_item_id: crate_data.root_item_id.0, - external_crates: Cow::Owned(external_crates), - paths: RkyvCowBytes::Owned(paths), - format_version: crate_data.format_version as i64, - items: RkyvCowBytes::Owned(items), - secondary_indexes: None, - }) - } - - /// Re-hydrate the documentation retrieved from the cache. - /// - /// We hydrate all mappings eagerly, but we avoid re-hydrating the item index eagerly, - /// since it can be quite large and deserialization can be slow for large crates. - /// The item index is stored as rkyv-serialized bytes for zero-copy access. 
- pub(super) fn hydrate(self, package_id: PackageId) -> Result { - let crate_data = CrateData { - root_item_id: rustdoc_types::Id(self.root_item_id.to_owned()), - external_crates: bincode::decode_from_slice(&self.external_crates, BINCODE_CONFIG) - .context("Failed to deserialize external_crates")? - .0, - paths: CrateItemPaths::Lazy(LazyCrateItemPaths { - bytes: self.paths.into_owned(), - }), - format_version: self.format_version.try_into()?, - index: CrateItemIndex::Lazy(LazyCrateItemIndex { - bytes: self.items.into_owned(), - }), - }; - let Some(secondary_indexes) = self.secondary_indexes else { - return Ok(RustdocCacheEntry::Raw(crate_data)); - }; - - let core = crate::rustdoc::queries::CrateCore { - package_id, - krate: crate_data, - }; - - let import_index = - bincode::decode_from_slice(&secondary_indexes.import_index, BINCODE_CONFIG) - .context("Failed to deserialize import_index")? - .0; - - let re_exports = bincode::decode_from_slice(&secondary_indexes.re_exports, BINCODE_CONFIG) - .context("Failed to deserialize re-exports")? - .0; - - let annotated_items = if let Some(data) = secondary_indexes.annotated_items { - bincode::decode_from_slice(&data, BINCODE_CONFIG) - .context("Failed to deserialize annotated_items")? - .0 - } else { - AnnotatedItems::default() - }; - - let krate = crate::rustdoc::Crate { - core, - annotated_items, - import_path2id: ImportPath2Id::Lazy(LazyImportPath2Id( - secondary_indexes.import_path2id.into_owned(), - )), - external_re_exports: re_exports, - import_index, - crate_id2package_id: Default::default(), - }; - Ok(RustdocCacheEntry::Processed(krate)) - } -} - -/// The key used to store and retrieve a crate's documentation from the cache. -/// -/// It tries to capture all the information that can influence the output of the -/// relevant `rustdoc` command. 
-#[derive(Debug)] -pub(super) struct ThirdPartyCrateCacheKey<'a> { - pub crate_name: &'a str, - pub crate_source: Cow<'a, str>, - pub crate_version: String, - /// The hash of the crate's source code, computed via BLAKE3. - /// It is only populated for path dependencies. - pub crate_hash: Option, - pub cargo_fingerprint: &'a str, - pub rustdoc_options: String, - pub default_feature_is_enabled: bool, - pub active_named_features: String, -} - -impl<'a> ThirdPartyCrateCacheKey<'a> { - /// Compute the cache key for a given package. - pub(super) fn build( - package_graph: &PackageGraph, - package_metadata: &'a PackageMetadata<'a>, - cargo_fingerprint: &'a str, - cache_workspace_packages: bool, - ) -> Option> { - enum PathOrId<'a> { - Path(Cow<'a, Utf8Path>), - Id(&'a str), - } - - impl<'a> From> for Cow<'a, str> { - fn from(val: PathOrId<'a>) -> Self { - match val { - PathOrId::Path(cow) => match cow { - Cow::Owned(path) => Cow::Owned(path.to_string()), - Cow::Borrowed(path) => Cow::Borrowed(path.as_str()), - }, - PathOrId::Id(id) => Cow::Borrowed(id), - } - } - } - - let source = match package_metadata.source() { - guppy::graph::PackageSource::Workspace(p) => { - if !cache_workspace_packages { - return None; - } - let p = package_graph.workspace().root().join(p); - PathOrId::Path(Cow::Owned(p)) - } - guppy::graph::PackageSource::Path(p) => PathOrId::Path(Cow::Borrowed(p)), - guppy::graph::PackageSource::External(e) => PathOrId::Id(e), - }; - let crate_hash = if let PathOrId::Path(package_path) = &source { - let package_path = if package_path.is_relative() { - package_graph.workspace().root().join(package_path) - } else { - package_path.clone().into_owned() - }; - // We need to compute the hash of the package's contents, - // to invalidate the cache when the package changes. - // This is only relevant for path dependencies. - // We don't need to do this for external dependencies, - // since they are assumed to be immutable. 
- let hash = match checksum_crate(&package_path) { - Ok(hash) => hash, - Err(e) => { - log_error!( - *e, - "Failed to compute the hash of the package at {}. \ - I won't cache its JSON documentation to avoid serving stale data.", - package_metadata.id().repr() - ); - return None; - } - }; - Some(hash.to_string()) - } else { - None - }; - let feature_graph = package_graph.feature_graph(); - let feature_set = feature_graph - .query_workspace(StandardFeatures::Default) - .resolve(); - let features = feature_set - .features_for(package_metadata.id()) - .expect("Failed to determine cargo features"); - let (default_feature_is_enabled, mut active_named_features) = match features { - Some(f) => (f.has_base(), f.named_features().collect()), - None => (false, vec![]), - }; - active_named_features.sort(); - let cache_key = ThirdPartyCrateCacheKey { - crate_name: package_metadata.name(), - crate_source: source.into(), - crate_version: package_metadata.version().to_string(), - crate_hash, - cargo_fingerprint, - default_feature_is_enabled, - // SQLite doesn't support arrays, so we have to serialize these two collections as strings. - // This is well defined, since we sorted features and the order of options is well-defined. - rustdoc_options: rustdoc_options().join(" "), - active_named_features: active_named_features.join(" "), - }; - Some(cache_key) - } -} - -/// Return the output of `cargo --verbose --version` for the nightly toolchain, -/// which can be used to fingerprint the toolchain used by Pavex. 
-pub fn cargo_fingerprint(toolchain_name: &str) -> Result { - let err_msg = || { - format!( - "Failed to run `cargo --verbose --version` on `{toolchain_name}`.\n\ -Is the `{toolchain_name}` toolchain installed?\n\ -If not, invoke\n - - rustup toolchain install {toolchain_name} -c rust-docs-json - -to fix it.", - ) - }; - let mut cmd = std::process::Command::new("rustup"); - cmd.arg("run") - .arg(toolchain_name) - .arg("cargo") - .arg("--verbose") - .arg("--version"); - let output = cmd.output().with_context(err_msg)?; - if !output.status.success() { - anyhow::bail!(err_msg()); + ) -> Result, anyhow::Error> { + self.inner.get_access_log(project_fingerprint) } - let output = String::from_utf8(output.stdout).with_context(|| { - format!("An invocation of `cargo --verbose --version` for the `{toolchain_name}` toolchain returned non-UTF8 data as output.") - })?; - Ok(output) } diff --git a/compiler/pavexc/src/rustdoc/compute/mod.rs b/compiler/pavexc/src/rustdoc/compute/mod.rs index 333d4433d..a744108e4 100644 --- a/compiler/pavexc/src/rustdoc/compute/mod.rs +++ b/compiler/pavexc/src/rustdoc/compute/mod.rs @@ -3,29 +3,24 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; mod cache; -mod checksum; -mod format; -mod toolchain; use ahash::{HashMap, HashMapExt}; -pub(super) use cache::CacheEntry; -pub(crate) use cache::{RustdocCacheKey, RustdocGlobalFsCache}; +pub(super) use cache::CacheEntryExt; +pub(crate) use cache::{PavexRustdocCache as RustdocGlobalFsCache, RustdocCacheKey}; use anyhow::Context; -use format::check_format; use guppy::graph::PackageGraph; use guppy::{PackageId, Version}; use indexmap::IndexSet; use itertools::Itertools as _; use pavex_cli_shell::SHELL; +use pavexc_rustdoc_cache::{check_format, get_toolchain_crate_docs, rustdoc_options}; use serde::Deserialize; use crate::rustdoc::TOOLCHAIN_CRATES; use crate::rustdoc::package_id_spec::PackageIdSpecification; use crate::rustdoc::utils::normalize_crate_name; -use self::toolchain::get_toolchain_crate_docs; 
- #[derive(Debug, thiserror::Error, Clone)] #[error( "I failed to retrieve information about the public types of a package in your dependency tree ('{package_spec}')." @@ -201,21 +196,6 @@ where Ok(results) } -/// Return the options to pass to `rustdoc` in order to generate JSON documentation. -/// -/// We isolate this logic in a separate function in order to be able to refer to these -/// options from various places in the codebase and maintain a single source of truth. -/// -/// In particular, they do affect our caching logic (see the `cache` module). -pub(super) fn rustdoc_options() -> [&'static str; 4] { - [ - "--document-private-items", - "-Zunstable-options", - "-wjson", - "--document-hidden-items", - ] -} - #[tracing::instrument(skip_all, fields(package_id_specs, cmd))] fn _compute_crate_docs<'a, I>( toolchain_name: &str, diff --git a/compiler/pavexc/src/rustdoc/mod.rs b/compiler/pavexc/src/rustdoc/mod.rs index 4c7025019..1cdea4a53 100644 --- a/compiler/pavexc/src/rustdoc/mod.rs +++ b/compiler/pavexc/src/rustdoc/mod.rs @@ -7,7 +7,9 @@ use once_cell::sync::Lazy; pub(crate) use annotations::{AnnotatedItem, AnnotationCoordinates, ImplInfo}; pub use compute::CannotGetCrateData; -pub use queries::{Crate, CrateCollection, GlobalItemId, ResolvedItem, RustdocKindExt}; +pub use queries::{ + Crate, CrateCollection, ExternalReExportsExt, GlobalItemId, ResolvedItem, RustdocKindExt, +}; mod annotations; mod compute; diff --git a/compiler/pavexc/src/rustdoc/queries.rs b/compiler/pavexc/src/rustdoc/queries.rs index ab4329d91..47605d325 100644 --- a/compiler/pavexc/src/rustdoc/queries.rs +++ b/compiler/pavexc/src/rustdoc/queries.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::cmp::Ordering; use std::collections::BTreeSet; use std::sync::{Arc, RwLock}; @@ -10,34 +9,30 @@ use guppy::graph::PackageGraph; use guppy::{PackageId, Version}; use indexmap::IndexSet; use rayon::iter::IntoParallelRefIterator; -use rkyv::collections::swiss_table::ArchivedHashMap; -use 
rkyv::hash::FxHasher64; -use rkyv::rancor::Panic; -use rkyv::string::ArchivedString; -use rkyv::util::AlignedVec; -use rkyv::vec::ArchivedVec; use rustc_hash::FxHashMap; -use rustdoc_types::{ - ArchivedId, ArchivedItem, ArchivedItemSummary, ExternalCrate, Item, ItemEnum, ItemKind, - ItemSummary, Visibility, -}; +use rustdoc_types::{ExternalCrate, Item, ItemEnum, ItemKind, Visibility}; use tracing::Span; use tracing_log_error::log_error; +// Import types from the cache crate +pub use pavexc_rustdoc_cache::{ + AnnotatedItems, CacheEntry, CrateData, CrateItemIndex, CrateItemPaths, + EagerCrateItemIndex, EagerCrateItemPaths, EagerImportPath2Id, EntryVisibility, + ExternalReExport, ExternalReExports, ImportIndex, ImportIndexEntry, ImportPath2Id, +}; + use crate::compiler::resolvers::{GenericBindings, resolve_type}; use crate::diagnostic::DiagnosticSink; use crate::language::{FQGenericArgument, FQPathType, UnknownCrate, krate2package_id}; -use crate::rustdoc::compute::CacheEntry; use crate::rustdoc::version_matcher::VersionMatcher; use crate::rustdoc::{ALLOC_PACKAGE_ID, CORE_PACKAGE_ID, STD_PACKAGE_ID}; use crate::rustdoc::{CannotGetCrateData, TOOLCHAIN_CRATES, utils}; use super::AnnotatedItem; use super::annotations::{ - self, AnnotatedItems, AnnotationCoordinates, QueueItem, invalid_diagnostic_attribute, - parse_pavex_attributes, + self, AnnotationCoordinates, QueueItem, invalid_diagnostic_attribute, parse_pavex_attributes, }; -use super::compute::{RustdocCacheKey, RustdocGlobalFsCache, compute_crate_docs}; +use super::compute::{CacheEntryExt, RustdocCacheKey, RustdocGlobalFsCache, compute_crate_docs}; /// The main entrypoint for accessing the documentation of the crates /// in a specific `PackageGraph`. 
@@ -262,9 +257,9 @@ impl CrateCollection { .par_iter() .filter_map(|(package_id, krate, cache_indexes)| { let data = if *cache_indexes { - CacheEntry::new(krate) + ::from_crate(krate) } else { - CacheEntry::raw(krate) + ::from_crate_raw(krate) }; let cache_key = RustdocCacheKey::new(package_id, package_graph); match data { @@ -671,150 +666,30 @@ pub struct CrateIdNeedle { maybe_dependent_crate_name: Option, } -#[derive(Debug, Clone)] -/// An index to lookup the id of a type given one of its import paths, either -/// public or private. -/// -/// The index does NOT contain macros, since macros and types live in two -/// different namespaces and can contain items with the same name. -/// E.g. `core::clone::Clone` is both a trait and a derive macro. -/// -/// Since the index can be quite large, we try to avoid deserializing it all at once. -/// -/// The `Eager` variant contains the entire index, fully deserialized. This is what we get -/// when we have had to index the documentation for the crate on the fly. -/// -/// The `Lazy` variant contains the index as a byte array, with entries deserialized on demand. -pub(crate) enum ImportPath2Id { - Eager(EagerImportPath2Id), - Lazy(LazyImportPath2Id), -} - -impl ImportPath2Id { - pub fn get(&self, path: &[String]) -> Option { - match self { - ImportPath2Id::Eager(m) => m.0.get(path).cloned(), - ImportPath2Id::Lazy(m) => m.get_deserialized(path), - } - } -} - -#[derive(Debug, Clone)] -/// See [`ImportPath2Id`] for more information. -pub(crate) struct EagerImportPath2Id(pub HashMap, rustdoc_types::Id>); - -/// See [`ImportPath2Id`] for more information. -/// -/// Stores rkyv-serialized bytes of a `HashMap, Id>` and provides zero-copy access. 
-#[derive(Debug, Clone)] -pub(crate) struct LazyImportPath2Id(pub AlignedVec); - -impl LazyImportPath2Id { - #[inline] - fn archived(&self) -> &ArchivedHashMap, ArchivedId> { - unsafe { - rkyv::access_unchecked::, ArchivedId>>( - &self.0, - ) - } - } - - pub fn get(&self, path: &[String]) -> Option<&ArchivedId> { - let path_vec: Vec = path.to_vec(); - let bytes = rkyv::to_bytes::(&path_vec).ok()?; - - let archived_key = unsafe { rkyv::access_unchecked::>(&bytes) }; - self.archived().get(archived_key) - } - - pub fn get_deserialized(&self, path: &[String]) -> Option { - let archived = self.get(path)?; - Some(rkyv::deserialize::<_, Panic>(archived).unwrap()) - } -} - -#[derive( - Debug, Clone, Default, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode, -)] -/// Track re-exports of types (or entire modules!) from other crates. -pub struct ExternalReExports { - /// Key: the path of the re-exported type in the current crate. - /// Value: the id of the `rustdoc` item of kind `use` that performed the re-export. - /// - /// E.g. `pub use hyper::server as sx;` in `lib.rs` would use `vec!["my_crate", "sx"]` - /// as key in this map. - target_path2use_id: HashMap, rustdoc_types::Id>, - /// Key: the id of the `rustdoc` item of kind `use` that performed the re-export. - /// Value: metadata about the re-export. - use_id2re_export: HashMap, -} - -impl ExternalReExports { - /// Iteratore over the external re-exports that have been collected. - pub fn iter( - &self, - ) -> impl Iterator, rustdoc_types::Id, &ExternalReExport)> { - self.target_path2use_id - .iter() - .map(|(target_path, id)| (target_path, *id, &self.use_id2re_export[id])) - } - - /// Add another re-export to the database. 
- pub fn insert( - &mut self, - krate: &CrateData, - use_item: &rustdoc_types::Item, - current_path: &[String], - ) { - let ItemEnum::Use(use_) = &use_item.inner else { - unreachable!() - }; - let imported_id = use_.id.expect("Import doesn't have an associated id"); - let Some(imported_summary) = krate.paths.get(&imported_id) else { - // TODO: this is firing for std's JSON docs. File a bug report. - // panic!("The imported id ({}) is not listed in the index nor in the path section of rustdoc's JSON output", imported_id.0) - return; - }; - debug_assert!(imported_summary.crate_id != 0); - // We are looking at a public re-export of another crate - // (e.g. `pub use hyper;`), one of its modules or one of its items. - // Due to how re-exports are handled in `rustdoc`, the re-exported - // items inside that foreign module will not be found in the `index` - // for this crate. - // We intentionally add foreign items to the index to get a "complete" - // picture of all the types available in this crate. - let external_crate_id = imported_summary.crate_id; - let source_path = imported_summary.path.to_owned(); - let re_exported_path = { - let mut p = current_path.to_owned(); - if !use_.is_glob { - p.push(use_.name.clone()); - } - p - }; - let re_export = ExternalReExport { - source_path, - external_crate_id, - }; - - self.target_path2use_id - .insert(re_exported_path, use_item.id); - self.use_id2re_export.insert(use_item.id, re_export); - } - +/// Extension trait for [`ExternalReExports`] that adds methods depending on pavexc types. +pub trait ExternalReExportsExt { /// Retrieve the re-exported item from the crate it was defined into. /// /// # Panics /// /// Panics if the provided `use_id` doesn't exist as a key in the re-export registry. - pub fn get_target_item_id( + fn get_target_item_id( &self, // The crate associated with these re-exports. 
re_exported_from: &Crate, krate_collection: &CrateCollection, use_id: rustdoc_types::Id, + ) -> Result, CannotGetCrateData>; +} + +impl ExternalReExportsExt for ExternalReExports { + fn get_target_item_id( + &self, + re_exported_from: &Crate, + krate_collection: &CrateCollection, + use_id: rustdoc_types::Id, ) -> Result, CannotGetCrateData> { - let re_export = &self.use_id2re_export[&use_id]; + let re_export = self.get(&use_id).expect("use_id not found in re-export registry"); let source_package_id = re_exported_from .core .compute_package_id_for_crate_id(re_export.external_crate_id, krate_collection, None) @@ -830,155 +705,6 @@ impl ExternalReExports { } } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode)] -/// Information about a type (or module) re-exported from another crate. -pub struct ExternalReExport { - /// The path of the re-exported type in the crate it was re-exported from. - /// - /// E.g. `pub use hyper::server as sx;` in `lib.rs` would set `source_path` to - /// `vec!["hyper", "server"]`. - source_path: Vec, - /// The id of the source crate in the `external_crates` section of the JSON - /// documentation of the crate that re-exported it. - external_crate_id: u32, -} - -#[derive( - Debug, Clone, Default, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode, -)] -pub struct ImportIndex { - /// A mapping that keeps track of all modules defined in the current crate. - /// - /// We track modules separately because their names are allowed to collide with - /// type and function names. - pub modules: HashMap, - /// A mapping that keeps track of traits, structs, enums and functions - /// defined in the current crate. - pub items: HashMap, - /// A mapping that associates the id of each re-export (`pub use ...`) to the id - /// of the module it was re-exported from. - pub re_export2parent_module: HashMap, -} - -/// An entry in [`ImportIndex`]. 
-#[derive( - Debug, Clone, Default, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode, -)] -pub struct ImportIndexEntry { - /// All the public paths that can be used to import the item. - pub public_paths: BTreeSet, - /// All the private paths that can be used to import the item. - pub private_paths: BTreeSet, - /// The path where the item was originally defined. - /// - /// It may be set to `None` if we can't access the original definition. - /// E.g. an item defined in a private module of `std`, where we only have access - /// to the public API. - pub defined_at: Option>, -} - -/// The visibility of a path inside [`ImportIndexEntry`]. -pub enum EntryVisibility { - /// The item can be imported from outside the crate where it was defined. - Public, - /// The item can only be imported from within the crate where it was defined. - Private, -} - -impl ImportIndexEntry { - /// A private constructor. - fn empty() -> Self { - Self { - public_paths: BTreeSet::new(), - private_paths: BTreeSet::new(), - defined_at: None, - } - } - - /// Create a new entry from a path. - pub fn new(path: Vec, visibility: EntryVisibility, is_definition: bool) -> Self { - let mut entry = Self::empty(); - if is_definition { - entry.defined_at = Some(path.clone()); - } - match visibility { - EntryVisibility::Public => entry.public_paths.insert(SortablePath(path)), - EntryVisibility::Private => entry.private_paths.insert(SortablePath(path)), - }; - entry - } - - /// Add a new private path for this item. - pub fn insert_private(&mut self, path: Vec) { - self.private_paths.insert(SortablePath(path)); - } - - /// Add a new path for this item. - pub fn insert(&mut self, path: Vec, visibility: EntryVisibility) { - match visibility { - EntryVisibility::Public => self.public_paths.insert(SortablePath(path)), - EntryVisibility::Private => self.private_paths.insert(SortablePath(path)), - }; - } - - /// Types can be exposed under multiple paths. 
- /// This method returns a "canonical" importable path—i.e. the shortest importable path - /// pointing at the type you specified. - /// - /// If the type is public, this method returns the shortest public path. - /// If the type is private, this method returns the shortest private path. - pub fn canonical_path(&self) -> &[String] { - if let Some(SortablePath(p)) = self.public_paths.first() { - return p; - } - if let Some(SortablePath(p)) = self.private_paths.first() { - return p; - } - unreachable!("There must be at least one path associated to an import index entry") - } - - /// Returns all paths associated with the type, both public and private. - pub fn paths(&self) -> impl Iterator { - self.public_paths - .iter() - .map(|SortablePath(p)| p.as_slice()) - .chain( - self.private_paths - .iter() - .map(|SortablePath(p)| p.as_slice()), - ) - } -} - -#[derive( - Debug, - Clone, - Eq, - PartialEq, - serde::Serialize, - serde::Deserialize, - bincode::Encode, - bincode::Decode, -)] -#[serde(transparent)] -pub struct SortablePath(pub Vec); - -impl Ord for SortablePath { - fn cmp(&self, other: &Self) -> Ordering { - match self.0.len().cmp(&other.0.len()) { - // Compare lexicographically if lengths are equal - Ordering::Equal => self.0.cmp(&other.0), - other => other, - } - } -} - -impl PartialOrd for SortablePath { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - #[derive(Debug, Clone)] pub(crate) struct CrateCore { /// The `PackageId` for the corresponding crate within the dependency tree @@ -988,209 +714,6 @@ pub(crate) struct CrateCore { pub(super) krate: CrateData, } -#[derive(Debug, Clone)] -/// The JSON documentation for a crate. -pub(crate) struct CrateData { - /// The id of the root item for the crate. - pub root_item_id: rustdoc_types::Id, - /// A mapping from the id of an external crate to the information about it. 
- #[allow(clippy::disallowed_types)] - pub external_crates: FxHashMap, - /// A mapping from the id of a type to its fully qualified path. - /// Primarily useful for foreign items that are being re-exported by this crate. - pub paths: CrateItemPaths, - /// The version of the JSON format used by rustdoc. - pub format_version: u32, - /// The index of all the items in the crate. - pub index: CrateItemIndex, -} - -#[derive(Debug, Clone)] -/// A mapping from the id of a type to its fully qualified path. -/// -/// Primarily useful for foreign items that are being re-exported by this crate. -pub(crate) enum CrateItemPaths { - Eager(EagerCrateItemPaths), - Lazy(LazyCrateItemPaths), -} - -impl CrateItemPaths { - /// Retrieve an item summary from the index given its id. - pub fn get(&self, id: &rustdoc_types::Id) -> Option> { - match self { - Self::Eager(m) => m.paths.get(id).map(Cow::Borrowed), - Self::Lazy(m) => { - let item = m.get_deserialized(id)?; - Some(Cow::Owned(item)) - } - } - } - - pub fn iter(&self) -> impl Iterator)> { - match self { - CrateItemPaths::Eager(paths) => CrateItemPathsIter::Eager(paths.paths.iter()), - CrateItemPaths::Lazy(paths) => CrateItemPathsIter::Lazy(paths.archived().iter()), - } - } -} - -pub enum CrateItemPathsIter<'a> { - Eager(std::collections::hash_map::Iter<'a, rustdoc_types::Id, ItemSummary>), - Lazy( - rkyv::collections::swiss_table::map::Iter<'a, ArchivedId, ArchivedItemSummary, FxHasher64>, - ), -} - -pub enum ItemSummaryRef<'a> { - Eager(&'a ItemSummary), - Lazy(&'a ArchivedItemSummary), -} - -impl<'a> ItemSummaryRef<'a> { - pub fn crate_id(&self) -> u32 { - match self { - ItemSummaryRef::Eager(s) => s.crate_id, - ItemSummaryRef::Lazy(s) => s.crate_id.to_native(), - } - } - - pub fn kind(&self) -> ItemKind { - match self { - ItemSummaryRef::Eager(s) => s.kind, - ItemSummaryRef::Lazy(s) => { - // Safe to do since the enum is repr(u8) - rkyv::deserialize::<_, rkyv::rancor::Infallible>(&s.kind).unwrap() - } - } - } - - pub fn 
path(&self) -> Cow<'_, [String]> { - match self { - ItemSummaryRef::Eager(s) => Cow::Borrowed(&s.path), - ItemSummaryRef::Lazy(s) => { - Cow::Owned(s.path.iter().map(|s| s.as_str().to_owned()).collect()) - } - } - } -} - -impl<'a> Iterator for CrateItemPathsIter<'a> { - type Item = (rustdoc_types::Id, ItemSummaryRef<'a>); - - fn next(&mut self) -> Option { - match self { - Self::Eager(iter) => iter.next().map(|(k, v)| (*k, ItemSummaryRef::Eager(v))), - Self::Lazy(iter) => iter - .next() - .map(|(k, v)| (rustdoc_types::Id(k.0.to_native()), ItemSummaryRef::Lazy(v))), - } - } -} - -#[derive(Debug, Clone)] -/// See [`CrateItemPaths`] for more information. -pub(crate) struct EagerCrateItemPaths { - #[allow(clippy::disallowed_types)] - pub paths: FxHashMap, -} - -/// See [`CrateItemPaths`] for more information. -#[derive(Debug, Clone)] -pub(crate) struct LazyCrateItemPaths { - pub(super) bytes: AlignedVec, -} - -impl LazyCrateItemPaths { - /// Get zero-copy access to the archived HashMap. - #[inline] - fn archived(&self) -> &ArchivedHashMap { - // SAFETY: The bytes were serialized by rkyv from a valid HashMap. - // We trust the cache to contain valid data. - unsafe { - rkyv::access_unchecked::>(&self.bytes) - } - } - - /// Get an item by its ID, returning a reference to the archived summary. - pub fn get(&self, id: &rustdoc_types::Id) -> Option<&ArchivedItemSummary> { - self.archived().get(&ArchivedId(id.0.into())) - } - - /// Deserialize a summary by its ID. - pub fn get_deserialized(&self, id: &rustdoc_types::Id) -> Option { - let archived = self.get(id)?; - Some(rkyv::deserialize::(archived).unwrap()) - } -} - -#[derive(Debug, Clone)] -/// The index of all the items in the crate. -/// -/// Since the index can be quite large, we try to avoid deserializing it all at once. -/// -/// The `Eager` variant contains the entire index, fully deserialized. This is what we get -/// when we have had to compute the documentation for the crate on the fly. 
-/// -/// The `Lazy` variant contains the index as a byte array. There is a mapping from the -/// id of an item to the start and end index of the item's bytes in the byte array. -/// We can therefore deserialize the item only if we need to access it. -/// Since we only access a tiny portion of the items in the index (especially for large crates), -/// this translates in a significant performance improvement. -pub(crate) enum CrateItemIndex { - Eager(EagerCrateItemIndex), - Lazy(LazyCrateItemIndex), -} - -impl CrateItemIndex { - /// Retrieve an item from the index given its id. - pub fn get(&self, id: &rustdoc_types::Id) -> Option> { - match self { - Self::Eager(index) => index.index.get(id).map(Cow::Borrowed), - Self::Lazy(index) => { - let item = index.get_deserialized(id)?; - Some(Cow::Owned(item)) - } - } - } -} - -#[derive(Debug, Clone)] -/// See [`CrateItemIndex`] for more information. -pub(crate) struct EagerCrateItemIndex { - #[allow(clippy::disallowed_types)] - pub index: FxHashMap, -} - -/// See [`CrateItemIndex`] for more information. -/// -/// Stores rkyv-serialized bytes of a `HashMap` and provides zero-copy access. -#[derive(Debug, Clone)] -pub(crate) struct LazyCrateItemIndex { - /// The rkyv-serialized bytes containing a `HashMap`. - pub(super) bytes: AlignedVec, -} - -impl LazyCrateItemIndex { - /// Get zero-copy access to the archived HashMap. - #[inline] - fn archived(&self) -> &ArchivedHashMap { - // SAFETY: The bytes were serialized by rkyv from a valid HashMap. - // We trust the cache to contain valid data. - unsafe { rkyv::access_unchecked::>(&self.bytes) } - } - - /// Get an item by its ID, returning a reference to the archived item. - pub fn get(&self, id: &rustdoc_types::Id) -> Option<&ArchivedItem> { - self.archived().get(&ArchivedId(id.0.into())) - } - - /// Deserialize an item by its ID. 
- pub fn get_deserialized(&self, id: &rustdoc_types::Id) -> Option { - let archived = self.get(id)?; - Some(rkyv::deserialize::(archived).unwrap()) - } -} - impl CrateCore { /// Given a crate id, return the corresponding [`PackageId`]. /// diff --git a/compiler/pavexc_rustdoc_cache/Cargo.toml b/compiler/pavexc_rustdoc_cache/Cargo.toml new file mode 100644 index 000000000..a7d10874d --- /dev/null +++ b/compiler/pavexc_rustdoc_cache/Cargo.toml @@ -0,0 +1,52 @@ +[package] +name = "pavexc_rustdoc_cache" +description = "JSON documentation caching for the Pavex compiler" +edition.workspace = true +repository.workspace = true +homepage.workspace = true +license.workspace = true +version.workspace = true +keywords = ["pavex"] + +[lints] +clippy = { large_enum_variant = "allow", result_large_err = "allow" } + +[dependencies] +# Core dependencies +rustdoc-types = { workspace = true } +pavexc_attr_parser = { path = "../pavexc_attr_parser", version = "=0.2.10" } +pavex_bp_schema = { path = "../pavex_bp_schema", version = "=0.2.10" } + +# Package graph +guppy = { workspace = true } +camino = { workspace = true } + +# SQLite cache +xdg-home = { workspace = true } +rusqlite = { workspace = true, features = ["bundled"] } +r2d2_sqlite = { workspace = true } +r2d2 = { workspace = true } + +# Serialization +bincode = { workspace = true, features = ["serde"] } +rkyv = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true, features = ["unbounded_depth"] } +serde_stacker = { workspace = true } + +# Utilities +anyhow = { workspace = true } +fs-err = { workspace = true } +xxhash-rust = { workspace = true, features = ["xxh64"] } +globwalk = { workspace = true } +tracing = { workspace = true } +tracing_log_error = { workspace = true } +num_cpus = { workspace = true } +itertools = { workspace = true } +thiserror = { workspace = true } +rustc-hash = { workspace = true } +ahash = { workspace = true } +toml = { workspace = true } +once_cell = 
{ workspace = true } + +px_workspace_hack = { version = "0.1", path = "../../px_workspace_hack" } diff --git a/compiler/pavexc/src/rustdoc/annotations/items.rs b/compiler/pavexc_rustdoc_cache/src/annotations/items.rs similarity index 100% rename from compiler/pavexc/src/rustdoc/annotations/items.rs rename to compiler/pavexc_rustdoc_cache/src/annotations/items.rs diff --git a/compiler/pavexc_rustdoc_cache/src/annotations/mod.rs b/compiler/pavexc_rustdoc_cache/src/annotations/mod.rs new file mode 100644 index 000000000..8d0f2d247 --- /dev/null +++ b/compiler/pavexc_rustdoc_cache/src/annotations/mod.rs @@ -0,0 +1,5 @@ +//! Types for tracking Pavex annotations in rustdoc JSON output. + +mod items; + +pub use items::{AnnotatedItem, AnnotatedItems, IdConflict, ImplInfo}; diff --git a/compiler/pavexc_rustdoc_cache/src/cache/mod.rs b/compiler/pavexc_rustdoc_cache/src/cache/mod.rs new file mode 100644 index 000000000..9cdc1d9e6 --- /dev/null +++ b/compiler/pavexc_rustdoc_cache/src/cache/mod.rs @@ -0,0 +1,313 @@ +//! SQLite-based caching for rustdoc JSON documentation. + +mod third_party; +mod toolchain; + +use std::collections::BTreeSet; + +use anyhow::Context; +use guppy::graph::{PackageGraph, PackageMetadata}; +use guppy::PackageId; +use itertools::Itertools; +use r2d2_sqlite::SqliteConnectionManager; +use rusqlite::params; + +use crate::annotations::AnnotatedItems; +use crate::types::{CacheEntry, CrateData, ExternalReExports, ImportIndex, ImportPath2Id}; +use crate::TOOLCHAIN_CRATES; + +use third_party::ThirdPartyCrateCache; +use toolchain::ToolchainCache; + +pub(crate) static BINCODE_CONFIG: bincode::config::Configuration = bincode::config::standard(); + +/// A cache for storing and retrieving pre-computed JSON documentation generated by `rustdoc`. +/// +/// The cache is shared across all Pavex projects of the current user. +/// It is stored on disk, in the user home directory, using a SQLite database. 
+#[derive(Debug, Clone)] +pub struct RustdocGlobalFsCache { + cargo_fingerprint: String, + third_party_cache: ThirdPartyCrateCache, + toolchain_cache: ToolchainCache, + connection_pool: r2d2::Pool, +} + +pub enum RustdocCacheKey<'a> { + ThirdPartyCrate(PackageMetadata<'a>), + ToolchainCrate(&'a str), +} + +impl std::fmt::Debug for RustdocCacheKey<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RustdocCacheKey::ThirdPartyCrate(metadata) => f + .debug_struct("ThirdPartyCrate") + .field("id", &metadata.id()) + .field("name", &metadata.name()) + .field("version", &metadata.version()) + .finish(), + RustdocCacheKey::ToolchainCrate(name) => f + .debug_struct("ToolchainCrate") + .field("name", name) + .finish(), + } + } +} + +/// An entry retrieved from the on-disk cache. +pub enum HydratedCacheEntry { + /// Only the "raw" output returned by `rustdoc` was stored in the cache. + /// + /// This happens when the indexing phase emitted one or more diagnostics, + /// thus forcing to go through that step (and report those errors) + /// every single time we attempt a compilation. + Raw(CrateData), + /// The cache holds both the raw `rustdoc` output and our secondary indexes. + /// It's ready to be used as is! + Processed(ProcessedCacheEntry), +} + +/// A fully processed cache entry with all secondary indexes. 
+pub struct ProcessedCacheEntry { + pub package_id: PackageId, + pub crate_data: CrateData, + pub import_path2id: ImportPath2Id, + pub import_index: ImportIndex, + pub external_re_exports: ExternalReExports, + pub annotated_items: AnnotatedItems, +} + +impl<'a> RustdocCacheKey<'a> { + pub fn new(package_id: &'a PackageId, package_graph: &'a PackageGraph) -> RustdocCacheKey<'a> { + if TOOLCHAIN_CRATES.contains(&package_id.repr()) { + RustdocCacheKey::ToolchainCrate(package_id.repr()) + } else { + RustdocCacheKey::ThirdPartyCrate(package_graph.metadata(package_id).unwrap()) + } + } +} + +impl RustdocGlobalFsCache { + /// Initialize a new instance of the cache. + /// + /// The `cache_fingerprint` is used to determine the database file name. + /// It should change whenever the caching logic changes. + #[tracing::instrument(name = "Initialize on-disk rustdoc cache", skip_all)] + pub fn new( + cache_fingerprint: &str, + toolchain_name: &str, + cache_workspace_package_docs: bool, + package_graph: &PackageGraph, + ) -> Result { + std::thread::scope(|scope| { + let handle = scope.spawn(|| cargo_fingerprint(toolchain_name)); + + let pool = Self::setup_database(cache_fingerprint)?; + let connection = pool.get()?; + let third_party_cache = ThirdPartyCrateCache::new( + &connection, + cache_workspace_package_docs, + package_graph, + )?; + let toolchain_cache = ToolchainCache::new(&connection)?; + let cargo_fingerprint = handle + .join() + .expect("Failed to compute `cargo`'s fingerprint")?; + Ok(Self { + cargo_fingerprint, + connection_pool: pool, + third_party_cache, + toolchain_cache, + }) + }) + } + + /// Retrieve the cached documentation for a given package, if available. 
+ pub fn get( + &self, + cache_key: &RustdocCacheKey, + package_graph: &PackageGraph, + ) -> Result, anyhow::Error> { + let connection = self.connection_pool.get()?; + match cache_key { + RustdocCacheKey::ThirdPartyCrate(metadata) => self.third_party_cache.get( + metadata, + &self.cargo_fingerprint, + &connection, + package_graph, + ), + RustdocCacheKey::ToolchainCrate(name) => { + self.toolchain_cache + .get(name, &self.cargo_fingerprint, &connection) + } + } + } + + /// Store the JSON documentation for a crate in the cache. + pub fn insert( + &self, + cache_key: &RustdocCacheKey, + cache_entry: CacheEntry, + package_graph: &PackageGraph, + ) -> Result<(), anyhow::Error> { + let connection = self.connection_pool.get()?; + match cache_key { + RustdocCacheKey::ThirdPartyCrate(metadata) => { + let Some(key) = self.third_party_cache.cache_key( + metadata, + &self.cargo_fingerprint, + package_graph, + ) else { + return Ok(()); + }; + self.third_party_cache + .insert(key, &connection, cache_entry) + } + RustdocCacheKey::ToolchainCrate(name) => { + self.toolchain_cache + .insert(name, cache_entry, &self.cargo_fingerprint, &connection) + } + } + } + + #[tracing::instrument(skip_all, level = "trace")] + /// Persist the list of package IDs that were accessed during the processing of the + /// application blueprint for this project. + pub fn persist_access_log( + &self, + package_ids: &BTreeSet, + project_fingerprint: &str, + ) -> Result<(), anyhow::Error> { + let connection = self.connection_pool.get()?; + + let mut stmt = connection.prepare_cached( + "INSERT INTO project2package_id_access_log ( + project_fingerprint, + package_ids + ) VALUES (?, ?) + ON CONFLICT(project_fingerprint) DO UPDATE SET package_ids=excluded.package_ids; + ", + )?; + stmt.execute(params![ + project_fingerprint, + bincode::encode_to_vec( + package_ids.iter().map(|s| s.repr()).collect_vec(), + BINCODE_CONFIG + )? 
+ ])?; + + Ok(()) + } + + #[tracing::instrument(skip_all, level = "trace")] + /// Retrieve the list of package IDs that were accessed during the last time we processed the + /// application blueprint for this project. + /// + /// Returns an empty set if no access log is found for the given project fingerprint. + pub fn get_access_log( + &self, + project_fingerprint: &str, + ) -> Result, anyhow::Error> { + let connection = self.connection_pool.get()?; + + let mut stmt = connection.prepare_cached( + "SELECT package_ids FROM project2package_id_access_log WHERE project_fingerprint = ?", + )?; + let mut rows = stmt.query(params![project_fingerprint])?; + let Some(row) = rows.next()? else { + return Ok(BTreeSet::new()); + }; + + let package_ids: Vec<&str> = + bincode::borrow_decode_from_slice(row.get_ref_unwrap(0).as_bytes()?, BINCODE_CONFIG)?.0; + Ok(package_ids.into_iter().map(PackageId::new).collect()) + } + + /// Initialize the database, creating the file and the relevant tables if they don't exist yet. + fn setup_database( + cache_fingerprint: &str, + ) -> Result, anyhow::Error> { + let cache_dir = xdg_home::home_dir() + .ok_or_else(|| anyhow::anyhow!("Failed to get the user's home directory"))? + .join(".pavex/rustdoc/cache"); + fs_err::create_dir_all(&cache_dir).with_context(|| { + format!( + "Failed to create the cache directory at {}", + cache_dir.to_string_lossy() + ) + })?; + + // For the sake of simplicity, we use a different SQLite database for each version of the + // cache crate. This ensures that we don't have to worry about schema migrations. + // The cost we pay: the user will have to re-generate the documentation for all their crates + // when they upgrade Pavex. 
+ let cache_path = cache_dir.join(format!("{cache_fingerprint}.db")); + + #[derive(Debug)] + struct SqlitePragmas; + + impl r2d2::CustomizeConnection for SqlitePragmas { + fn on_acquire(&self, conn: &mut rusqlite::Connection) -> Result<(), rusqlite::Error> { + conn.execute_batch( + // 250MB memory-mapped, more than enough. + "PRAGMA mmap_size=262144000;", + )?; + Ok(()) + } + } + + let manager = SqliteConnectionManager::file(cache_path); + let pool = r2d2::Pool::builder() + .max_size(num_cpus::get() as u32) + .connection_customizer(Box::new(SqlitePragmas)) + .build(manager) + .context("Failed to open/create a SQLite database to store the contents of pavex's rustdoc cache")?; + + let connection = pool.get()?; + connection.execute_batch( + "PRAGMA journal_mode=WAL; + PRAGMA synchronous=NORMAL;", + )?; + connection.execute( + "CREATE TABLE IF NOT EXISTS project2package_id_access_log ( + project_fingerprint TEXT NOT NULL, + package_ids BLOB NOT NULL, + PRIMARY KEY (project_fingerprint) + )", + [], + )?; + + Ok(pool) + } +} + +/// Return the output of `cargo --verbose --version` for the nightly toolchain, +/// which can be used to fingerprint the toolchain used by Pavex. 
+pub fn cargo_fingerprint(toolchain_name: &str) -> Result { + let err_msg = || { + format!( + "Failed to run `cargo --verbose --version` on `{toolchain_name}`.\n\ +Is the `{toolchain_name}` toolchain installed?\n\ +If not, invoke\n + + rustup toolchain install {toolchain_name} -c rust-docs-json + +to fix it.", + ) + }; + let mut cmd = std::process::Command::new("rustup"); + cmd.arg("run") + .arg(toolchain_name) + .arg("cargo") + .arg("--verbose") + .arg("--version"); + let output = cmd.output().with_context(err_msg)?; + if !output.status.success() { + anyhow::bail!(err_msg()); + } + let output = String::from_utf8(output.stdout).with_context(|| { + format!("An invocation of `cargo --verbose --version` for the `{toolchain_name}` toolchain returned non-UTF8 data as output.") + })?; + Ok(output) +} diff --git a/compiler/pavexc_rustdoc_cache/src/cache/third_party.rs b/compiler/pavexc_rustdoc_cache/src/cache/third_party.rs new file mode 100644 index 000000000..b788e8919 --- /dev/null +++ b/compiler/pavexc_rustdoc_cache/src/cache/third_party.rs @@ -0,0 +1,375 @@ +//! Cache for third-party crates. + +use std::borrow::Cow; + +use anyhow::Context; +use camino::Utf8Path; +use guppy::graph::feature::StandardFeatures; +use guppy::graph::{PackageGraph, PackageMetadata}; +use rusqlite::params; +use tracing::instrument; +use tracing_log_error::log_error; + +use crate::checksum::checksum_crate; +use crate::rustdoc_options; +use crate::types::{ + CacheEntry, RkyvCowBytes, SecondaryIndexes, ThirdPartyCrateCacheKey, +}; + +use super::HydratedCacheEntry; + +#[derive(Debug, Clone)] +#[non_exhaustive] +pub(super) struct ThirdPartyCrateCache { + pub(super) cache_workspace_packages: bool, +} + +impl ThirdPartyCrateCache { + pub(super) fn new( + connection: &rusqlite::Connection, + cache_workspace_packages: bool, + package_graph: &PackageGraph, + ) -> Result { + Self::setup_table(connection)?; + // Force the creation of the feature graph ahead of our queries. 
+ // It'll be cached internally by the `package_graph`. + let _ = package_graph.feature_graph(); + Ok(Self { + cache_workspace_packages, + }) + } + + /// Retrieve the cached documentation for a given package, if available. + #[instrument(name = "Retrieve third-party crate docs from disk cache", + skip_all, + level=tracing::Level::DEBUG, + fields(crate.id = %package_metadata.id(), cache_key = tracing::field::Empty, hit = tracing::field::Empty) + )] + pub(super) fn get( + &self, + package_metadata: &PackageMetadata, + cargo_fingerprint: &str, + connection: &rusqlite::Connection, + package_graph: &PackageGraph, + ) -> Result, anyhow::Error> { + fn _get( + package_metadata: &PackageMetadata, + cargo_fingerprint: &str, + connection: &rusqlite::Connection, + cache_workspace_packages: bool, + package_graph: &PackageGraph, + ) -> Result, anyhow::Error> { + let Some(cache_key) = ThirdPartyCrateCacheKey::build( + package_graph, + package_metadata, + cargo_fingerprint, + cache_workspace_packages, + ) else { + return Ok(None); + }; + tracing::Span::current().record("cache_key", tracing::field::debug(&cache_key)); + // Retrieve from rustdoc's output from cache, if available. + let mut stmt = connection.prepare_cached( + "SELECT + root_item_id, + external_crates, + paths, + format_version, + items, + import_index, + import_path2id, + re_exports, + annotated_items + FROM rustdoc_3d_party_crates_cache + WHERE crate_name = ? AND + crate_source = ? AND + crate_version = ? AND + crate_hash = ? AND + cargo_fingerprint = ? AND + rustdoc_options = ? AND + default_feature_is_enabled = ? AND + active_named_features = ?", + )?; + let span = tracing::trace_span!("Execute query"); + let guard = span.enter(); + let mut rows = stmt.query(params![ + cache_key.crate_name, + cache_key.crate_source, + cache_key.crate_version, + // `NULL` values are considered to be distinct from all other values + // by SQLite, including other `NULL`s. 
Therefore we use an empty + // string as a placeholder for `NULL` values. + cache_key.crate_hash.unwrap_or_default(), + cache_key.cargo_fingerprint, + cache_key.rustdoc_options, + cache_key.default_feature_is_enabled, + cache_key.active_named_features + ])?; + let Some(row) = rows.next().context("Failed to fetch next row")? else { + return Ok(None); + }; + drop(guard); + + let root_item_id = row.get_ref_unwrap(0).as_i64()?.try_into()?; + let external_crates = row.get_ref_unwrap(1).as_bytes()?; + let paths = row.get_ref_unwrap(2).as_bytes()?; + let format_version = row.get_ref_unwrap(3).as_i64()?; + let items = row.get_ref_unwrap(4).as_bytes()?; + let import_index = row.get_ref_unwrap(5).as_bytes_or_null()?; + let import_path2id = row.get_ref_unwrap(6).as_bytes_or_null()?; + let re_exports = row.get_ref_unwrap(7).as_bytes_or_null()?; + let annotated_items = row.get_ref_unwrap(8).as_bytes_or_null()?; + + let secondary_indexes = + match (import_index, import_path2id, re_exports, annotated_items) { + ( + Some(import_index), + Some(import_path2id), + Some(re_exports), + Some(annotated_items), + ) => Some(SecondaryIndexes { + import_index: Cow::Borrowed(import_index), + import_path2id: RkyvCowBytes::Borrowed(import_path2id), + re_exports: Cow::Borrowed(re_exports), + annotated_items: Some(Cow::Borrowed(annotated_items)), + }), + _ => None, + }; + + let krate = CacheEntry { + root_item_id, + external_crates: Cow::Borrowed(external_crates), + paths: RkyvCowBytes::Borrowed(paths), + format_version, + items: RkyvCowBytes::Borrowed(items), + secondary_indexes, + } + .hydrate(package_metadata.id().to_owned()) + .context("Failed to re-hydrate the stored docs")?; + + Ok(Some(krate)) + } + let outcome = _get( + package_metadata, + cargo_fingerprint, + connection, + self.cache_workspace_packages, + package_graph, + ); + match &outcome { + Ok(Some(_)) => { + tracing::Span::current().record("hit", true); + } + Ok(None) => { + tracing::Span::current().record("hit", false); + } + _ => 
{} + } + outcome + } + + /// Compute the cache key for a given package. + pub(super) fn cache_key<'a>( + &self, + package_metadata: &'a PackageMetadata, + cargo_fingerprint: &'a str, + package_graph: &PackageGraph, + ) -> Option> { + ThirdPartyCrateCacheKey::build( + package_graph, + package_metadata, + cargo_fingerprint, + self.cache_workspace_packages, + ) + } + + /// Store the JSON documentation generated by `rustdoc` in the cache. + #[instrument( + name = "Stored cache data for third-party crate docs to disk", + skip_all, + level=tracing::Level::DEBUG, + fields(cache_key = tracing::field::Empty)) + ] + pub(super) fn insert( + &self, + cache_key: ThirdPartyCrateCacheKey<'_>, + connection: &rusqlite::Connection, + cached_data: CacheEntry<'_>, + ) -> Result<(), anyhow::Error> { + tracing::Span::current().record("cache_key", tracing::field::debug(&cache_key)); + let mut stmt = connection.prepare_cached( + "INSERT INTO rustdoc_3d_party_crates_cache ( + crate_name, + crate_source, + crate_version, + crate_hash, + cargo_fingerprint, + rustdoc_options, + default_feature_is_enabled, + active_named_features, + root_item_id, + external_crates, + paths, + format_version, + items, + import_index, + import_path2id, + re_exports, + annotated_items + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + )?; + stmt.execute(params![ + cache_key.crate_name, + cache_key.crate_source, + cache_key.crate_version, + // `NULL` values are considered to be distinct from all other values + // by SQLite, including other `NULL`s. Therefore we use an empty + // string as a placeholder for `NULL` values. 
+ cache_key.crate_hash.unwrap_or_default(), + cache_key.cargo_fingerprint, + cache_key.rustdoc_options, + cache_key.default_feature_is_enabled, + cache_key.active_named_features, + cached_data.root_item_id, + cached_data.external_crates, + cached_data.paths, + cached_data.format_version, + cached_data.items, + cached_data + .secondary_indexes + .as_ref() + .map(|i| i.import_index.as_ref()), + cached_data + .secondary_indexes + .as_ref() + .map(|indexes| indexes.import_path2id.as_ref()), + cached_data + .secondary_indexes + .as_ref() + .map(|indexes| indexes.re_exports.as_ref()), + cached_data + .secondary_indexes + .as_ref() + .map(|indexes| indexes.annotated_items.as_ref()) + ])?; + Ok(()) + } + + fn setup_table(connection: &rusqlite::Connection) -> Result<(), anyhow::Error> { + connection.execute( + "CREATE TABLE IF NOT EXISTS rustdoc_3d_party_crates_cache ( + crate_name TEXT NOT NULL, + crate_source TEXT NOT NULL, + crate_version TEXT NOT NULL, + crate_hash TEXT NOT NULL, + cargo_fingerprint TEXT NOT NULL, + rustdoc_options TEXT NOT NULL, + default_feature_is_enabled INTEGER NOT NULL, + active_named_features TEXT NOT NULL, + root_item_id INTEGER NOT NULL, + external_crates BLOB NOT NULL, + paths BLOB NOT NULL, + format_version INTEGER NOT NULL, + items BLOB NOT NULL, + annotated_items BLOB, + import_index BLOB, + import_path2id BLOB, + re_exports BLOB, + PRIMARY KEY (crate_name, crate_source, crate_version, crate_hash, cargo_fingerprint, rustdoc_options, default_feature_is_enabled, active_named_features) + )", + [] + )?; + Ok(()) + } +} + +impl<'a> ThirdPartyCrateCacheKey<'a> { + /// Compute the cache key for a given package. 
+ pub fn build( + package_graph: &PackageGraph, + package_metadata: &'a PackageMetadata<'a>, + cargo_fingerprint: &'a str, + cache_workspace_packages: bool, + ) -> Option> { + enum PathOrId<'a> { + Path(Cow<'a, Utf8Path>), + Id(&'a str), + } + + impl<'a> From> for Cow<'a, str> { + fn from(val: PathOrId<'a>) -> Self { + match val { + PathOrId::Path(cow) => match cow { + Cow::Owned(path) => Cow::Owned(path.to_string()), + Cow::Borrowed(path) => Cow::Borrowed(path.as_str()), + }, + PathOrId::Id(id) => Cow::Borrowed(id), + } + } + } + + let source = match package_metadata.source() { + guppy::graph::PackageSource::Workspace(p) => { + if !cache_workspace_packages { + return None; + } + let p = package_graph.workspace().root().join(p); + PathOrId::Path(Cow::Owned(p)) + } + guppy::graph::PackageSource::Path(p) => PathOrId::Path(Cow::Borrowed(p)), + guppy::graph::PackageSource::External(e) => PathOrId::Id(e), + }; + let crate_hash = if let PathOrId::Path(package_path) = &source { + let package_path = if package_path.is_relative() { + package_graph.workspace().root().join(package_path) + } else { + package_path.clone().into_owned() + }; + // We need to compute the hash of the package's contents, + // to invalidate the cache when the package changes. + // This is only relevant for path dependencies. + // We don't need to do this for external dependencies, + // since they are assumed to be immutable. + let hash = match checksum_crate(&package_path) { + Ok(hash) => hash, + Err(e) => { + log_error!( + *e, + "Failed to compute the hash of the package at {}. 
\ + I won't cache its JSON documentation to avoid serving stale data.", + package_metadata.id().repr() + ); + return None; + } + }; + Some(hash.to_string()) + } else { + None + }; + let feature_graph = package_graph.feature_graph(); + let feature_set = feature_graph + .query_workspace(StandardFeatures::Default) + .resolve(); + let features = feature_set + .features_for(package_metadata.id()) + .expect("Failed to determine cargo features"); + let (default_feature_is_enabled, mut active_named_features) = match features { + Some(f) => (f.has_base(), f.named_features().collect()), + None => (false, vec![]), + }; + active_named_features.sort(); + let cache_key = ThirdPartyCrateCacheKey { + crate_name: package_metadata.name(), + crate_source: source.into(), + crate_version: package_metadata.version().to_string(), + crate_hash, + cargo_fingerprint, + default_feature_is_enabled, + // SQLite doesn't support arrays, so we have to serialize these two collections as strings. + // This is well defined, since we sorted features and the order of options is well-defined. + rustdoc_options: rustdoc_options().join(" "), + active_named_features: active_named_features.join(" "), + }; + Some(cache_key) + } +} diff --git a/compiler/pavexc_rustdoc_cache/src/cache/toolchain.rs b/compiler/pavexc_rustdoc_cache/src/cache/toolchain.rs new file mode 100644 index 000000000..14f57ccd0 --- /dev/null +++ b/compiler/pavexc_rustdoc_cache/src/cache/toolchain.rs @@ -0,0 +1,218 @@ +//! Cache for toolchain crates (std, core, alloc). 
+ +use std::borrow::Cow; + +use guppy::PackageId; +use rusqlite::params; +use tracing::instrument; + +use crate::annotations::AnnotatedItems; +use crate::types::{ + CacheEntry, CrateData, CrateItemIndex, CrateItemPaths, ImportPath2Id, LazyCrateItemIndex, + LazyCrateItemPaths, LazyImportPath2Id, RkyvCowBytes, SecondaryIndexes, +}; + +use super::{ProcessedCacheEntry, HydratedCacheEntry, BINCODE_CONFIG}; + +#[derive(Debug, Clone)] +#[non_exhaustive] +pub(super) struct ToolchainCache {} + +impl ToolchainCache { + pub(super) fn new(connection: &rusqlite::Connection) -> Result { + Self::setup_table(connection)?; + Ok(Self {}) + } + + /// Retrieve the cached documentation for a given toolchain crate, if available. + #[instrument(name = "Retrieve cached toolchain docs from disk", + skip_all, + level=tracing::Level::DEBUG, + fields(crate.name = %name) + )] + pub(super) fn get( + &self, + name: &str, + cargo_fingerprint: &str, + connection: &rusqlite::Connection, + ) -> Result, anyhow::Error> { + // Retrieve from rustdoc's output from cache, if available. + let mut stmt = connection.prepare_cached( + "SELECT + root_item_id, + external_crates, + paths, + format_version, + items, + import_index, + import_path2id, + re_exports + FROM rustdoc_toolchain_crates_cache + WHERE name = ? AND cargo_fingerprint = ?", + )?; + + let span = tracing::trace_span!("Execute query"); + let guard = span.enter(); + let mut rows = stmt.query(params![name, cargo_fingerprint])?; + let Some(row) = rows.next()? 
else { + return Ok(None); + }; + drop(guard); + + let root_item_id = row.get_ref_unwrap(0).as_i64()?.try_into()?; + let external_crates = row.get_ref_unwrap(1).as_bytes()?; + let paths = row.get_ref_unwrap(2).as_bytes()?; + let format_version = row.get_ref_unwrap(3).as_i64()?; + + let items = row.get_ref_unwrap(4).as_bytes()?; + + let import_index = row.get_ref_unwrap(5).as_bytes()?; + let import_path2id = row.get_ref_unwrap(6).as_bytes()?; + let re_exports = row.get_ref_unwrap(7).as_bytes()?; + + let krate = CacheEntry { + root_item_id, + external_crates: Cow::Borrowed(external_crates), + paths: RkyvCowBytes::Borrowed(paths), + format_version, + items: RkyvCowBytes::Borrowed(items), + secondary_indexes: Some(SecondaryIndexes { + import_index: Cow::Borrowed(import_index), + // Standard library crates don't have Pavex annotations. + annotated_items: None, + import_path2id: RkyvCowBytes::Borrowed(import_path2id), + re_exports: Cow::Borrowed(re_exports), + }), + } + .hydrate(PackageId::new(name))?; + + Ok(Some(krate)) + } + + /// Store the JSON documentation for a toolchain crate in the cache. 
+ #[instrument(name = "Cache rustdoc output on disk", skip_all, level=tracing::Level::DEBUG, fields(crate.name = name))] + pub(super) fn insert( + &self, + name: &str, + cache_entry: CacheEntry<'_>, + cargo_fingerprint: &str, + connection: &rusqlite::Connection, + ) -> Result<(), anyhow::Error> { + let mut stmt = connection.prepare_cached( + "INSERT INTO rustdoc_toolchain_crates_cache ( + name, + cargo_fingerprint, + root_item_id, + external_crates, + paths, + format_version, + items, + import_index, + import_path2id, + re_exports + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + )?; + stmt.execute(params![ + name, + cargo_fingerprint, + cache_entry.root_item_id, + cache_entry.external_crates, + cache_entry.paths, + cache_entry.format_version, + cache_entry.items, + cache_entry + .secondary_indexes + .as_ref() + .expect("Indexing never fails for toolchain crates") + .import_index, + cache_entry + .secondary_indexes + .as_ref() + .expect("Indexing never fails for toolchain crates") + .import_path2id, + cache_entry + .secondary_indexes + .as_ref() + .expect("Indexing never fails for toolchain crates") + .re_exports + ])?; + Ok(()) + } + + fn setup_table(connection: &rusqlite::Connection) -> Result<(), anyhow::Error> { + connection.execute( + "CREATE TABLE IF NOT EXISTS rustdoc_toolchain_crates_cache ( + name TEXT NOT NULL, + cargo_fingerprint TEXT NOT NULL, + root_item_id INTEGER NOT NULL, + external_crates BLOB NOT NULL, + paths BLOB NOT NULL, + format_version INTEGER NOT NULL, + items BLOB NOT NULL, + import_index BLOB NOT NULL, + import_path2id BLOB NOT NULL, + re_exports BLOB NOT NULL, + PRIMARY KEY (name, cargo_fingerprint) + )", + [], + )?; + Ok(()) + } +} + +impl CacheEntry<'_> { + /// Re-hydrate the documentation retrieved from the cache. + /// + /// We hydrate all mappings eagerly, but we avoid re-hydrating the item index eagerly, + /// since it can be quite large and deserialization can be slow for large crates. 
+    /// The item index is stored as rkyv-serialized bytes for zero-copy access.
+    pub fn hydrate(self, package_id: PackageId) -> Result<HydratedCacheEntry, anyhow::Error> {
+        use anyhow::Context;
+
+        let crate_data = CrateData {
+            root_item_id: rustdoc_types::Id(self.root_item_id.to_owned()),
+            external_crates: bincode::decode_from_slice(&self.external_crates, BINCODE_CONFIG)
+                .context("Failed to deserialize external_crates")?
+                .0,
+            paths: CrateItemPaths::Lazy(LazyCrateItemPaths {
+                bytes: self.paths.into_owned(),
+            }),
+            format_version: self.format_version.try_into()?,
+            index: CrateItemIndex::Lazy(LazyCrateItemIndex {
+                bytes: self.items.into_owned(),
+            }),
+        };
+        let Some(secondary_indexes) = self.secondary_indexes else {
+            return Ok(HydratedCacheEntry::Raw(crate_data));
+        };
+
+        let import_index =
+            bincode::decode_from_slice(&secondary_indexes.import_index, BINCODE_CONFIG)
+                .context("Failed to deserialize import_index")?
+                .0;
+
+        let re_exports = bincode::decode_from_slice(&secondary_indexes.re_exports, BINCODE_CONFIG)
+            .context("Failed to deserialize re-exports")?
+            .0;
+
+        let annotated_items = if let Some(data) = secondary_indexes.annotated_items {
+            bincode::decode_from_slice(&data, BINCODE_CONFIG)
+                .context("Failed to deserialize annotated_items")?
+                .0
+        } else {
+            AnnotatedItems::default()
+        };
+
+        let processed = ProcessedCacheEntry {
+            package_id,
+            crate_data,
+            import_path2id: ImportPath2Id::Lazy(LazyImportPath2Id(
+                secondary_indexes.import_path2id.into_owned(),
+            )),
+            external_re_exports: re_exports,
+            import_index,
+            annotated_items,
+        };
+        Ok(HydratedCacheEntry::Processed(processed))
+    }
+}
diff --git a/compiler/pavexc/src/rustdoc/compute/checksum.rs b/compiler/pavexc_rustdoc_cache/src/checksum.rs
similarity index 97%
rename from compiler/pavexc/src/rustdoc/compute/checksum.rs
rename to compiler/pavexc_rustdoc_cache/src/checksum.rs
index 8bdeab5d8..a8711269d 100644
--- a/compiler/pavexc/src/rustdoc/compute/checksum.rs
+++ b/compiler/pavexc_rustdoc_cache/src/checksum.rs
@@ -12,7 +12,7 @@ use camino::Utf8Path;
 /// 1. Determine which files are in scope.
 /// 2. Calculate the checksum of everything that was discovered, including the file names.
 #[tracing::instrument("Checksum crate files", level = tracing::Level::DEBUG)]
-pub(super) fn checksum_crate(root_path: &Utf8Path) -> Result<u64, anyhow::Error> {
+pub fn checksum_crate(root_path: &Utf8Path) -> Result<u64, anyhow::Error> {
     let paths = get_file_paths(root_path)?;
 
     let mut hasher = xxhash_rust::xxh64::Xxh64::new(24);
diff --git a/compiler/pavexc/src/rustdoc/compute/format.rs b/compiler/pavexc_rustdoc_cache/src/format.rs
similarity index 91%
rename from compiler/pavexc/src/rustdoc/compute/format.rs
rename to compiler/pavexc_rustdoc_cache/src/format.rs
index 878357259..cbfb27c90 100644
--- a/compiler/pavexc/src/rustdoc/compute/format.rs
+++ b/compiler/pavexc_rustdoc_cache/src/format.rs
@@ -7,7 +7,7 @@ struct CrateMeta {
 }
 
 /// Check that the JSON docs we are working with using the expected format version.
-pub(super) fn check_format<R: std::io::Read>(raw_json: R) -> Result<(), anyhow::Error> {
+pub fn check_format<R: std::io::Read>(raw_json: R) -> Result<(), anyhow::Error> {
     let Ok(min_krate) = serde_json::from_reader::<R, CrateMeta>(raw_json) else {
         anyhow::bail!(
             "Failed to deserialize the `format_version` of the generated JSON docs.
Is it actually the JSON documentation for a crate?" diff --git a/compiler/pavexc_rustdoc_cache/src/lib.rs b/compiler/pavexc_rustdoc_cache/src/lib.rs new file mode 100644 index 000000000..f668e7aae --- /dev/null +++ b/compiler/pavexc_rustdoc_cache/src/lib.rs @@ -0,0 +1,53 @@ +//! This crate encapsulates the logic required to cache and retrieve JSON documentation +//! generated by `rustdoc` for crates in a Pavex project's dependency graph. +//! +//! The cache is stored in a SQLite database at `~/.pavex/rustdoc/cache/{fingerprint}.db`. + +pub mod annotations; +mod cache; +mod checksum; +mod format; +mod toolchain; +mod types; + +pub use annotations::{AnnotatedItem, AnnotatedItems, IdConflict, ImplInfo}; +pub use cache::{ + ProcessedCacheEntry, HydratedCacheEntry, RustdocCacheKey, RustdocGlobalFsCache, + cargo_fingerprint, +}; +pub use checksum::checksum_crate; +pub use format::check_format; +pub use toolchain::get_toolchain_crate_docs; +pub use types::*; + +/// Crate version - used as part of cache fingerprint. +pub const CRATE_VERSION: &str = env!("CARGO_PKG_VERSION"); + +/// Standard library crate package ID representation. +pub const STD_PACKAGE_ID_REPR: &str = "std"; +/// Core crate package ID representation. +pub const CORE_PACKAGE_ID_REPR: &str = "core"; +/// Alloc crate package ID representation. +pub const ALLOC_PACKAGE_ID_REPR: &str = "alloc"; + +/// The set of toolchain crates that are bundled with Rust. +pub const TOOLCHAIN_CRATES: [&str; 3] = [ + STD_PACKAGE_ID_REPR, + CORE_PACKAGE_ID_REPR, + ALLOC_PACKAGE_ID_REPR, +]; + +/// Return the options to pass to `rustdoc` in order to generate JSON documentation. +/// +/// We isolate this logic in a separate function in order to be able to refer to these +/// options from various places in the codebase and maintain a single source of truth. +/// +/// In particular, they do affect our caching logic (see the `cache` module). 
+pub fn rustdoc_options() -> [&'static str; 4] {
+    [
+        "--document-private-items",
+        "-Zunstable-options",
+        "-wjson",
+        "--document-hidden-items",
+    ]
+}
diff --git a/compiler/pavexc/src/rustdoc/compute/toolchain.rs b/compiler/pavexc_rustdoc_cache/src/toolchain.rs
similarity index 86%
rename from compiler/pavexc/src/rustdoc/compute/toolchain.rs
rename to compiler/pavexc_rustdoc_cache/src/toolchain.rs
index b48814927..d53b50c1d 100644
--- a/compiler/pavexc/src/rustdoc/compute/toolchain.rs
+++ b/compiler/pavexc_rustdoc_cache/src/toolchain.rs
@@ -3,7 +3,7 @@ use once_cell::sync::OnceCell;
 use rustdoc_types::ItemKind;
 use std::path::PathBuf;
 
-use crate::rustdoc::compute::format::check_format;
+use crate::format::check_format;
 
 #[tracing::instrument(
     skip_all,
@@ -11,7 +11,7 @@ use crate::rustdoc::compute::format::check_format;
         crate.name = name,
     )
 )]
-pub(crate) fn get_toolchain_crate_docs(
+pub fn get_toolchain_crate_docs(
     name: &str,
     toolchain_name: &str,
 ) -> Result<rustdoc_types::Crate, anyhow::Error> {
@@ -22,16 +22,16 @@ pub fn get_toolchain_crate_docs(
     let mut krate = match serde_json::from_str::<rustdoc_types::Crate>(&json) {
         Ok(krate) => krate,
         Err(e) => {
-            return match check_format(std::io::Cursor::new(json)) { Err(format_err) => {
-                Err(format_err).with_context(|| {
+            return match check_format(std::io::Cursor::new(json)) {
+                Err(format_err) => Err(format_err).with_context(|| {
                     format!(
-                        "The JSON docs for {name} are not in the expected format. Are you using the right version of the `nightly` toolchain, `{}`, to generate the JSON docs?",
-                        crate::DEFAULT_DOCS_TOOLCHAIN
+                        "The JSON docs for {name} are not in the expected format. \
+                        Are you using the right version of the `nightly` toolchain, `{toolchain_name}`, \
+                        to generate the JSON docs?"
) - }) - } _ => { - Err(e).with_context(|| format!("Failed to deserialize the JSON docs for {name}")) - }}; + }), + _ => Err(e).with_context(|| format!("Failed to deserialize the JSON docs for {name}")), + }; } }; @@ -72,7 +72,7 @@ fn get_toolchain_root_folder_via_rustup(name: &str) -> Result = OnceCell::new(); -pub(super) fn get_cargo_via_rustup(toolchain_name: &str) -> Result { +pub fn get_cargo_via_rustup(toolchain_name: &str) -> Result { fn compute_cargo_via_rustup(toolchain_name: &str) -> Result { let mut cmd = std::process::Command::new("rustup"); cmd.arg("which") diff --git a/compiler/pavexc_rustdoc_cache/src/types/cache_entry.rs b/compiler/pavexc_rustdoc_cache/src/types/cache_entry.rs new file mode 100644 index 000000000..28cfacd0f --- /dev/null +++ b/compiler/pavexc_rustdoc_cache/src/types/cache_entry.rs @@ -0,0 +1,84 @@ +//! Types for serialized cache entries. + +use std::borrow::Cow; + +use rkyv::util::AlignedVec; +use rusqlite::{ToSql, types::ToSqlOutput}; + +/// A `Cow` variant to work with `rkyv`'s `AlignedVec`. 
+#[derive(Debug)]
+pub enum RkyvCowBytes<'a> {
+    Borrowed(&'a [u8]),
+    Owned(AlignedVec),
+}
+
+impl ToSql for RkyvCowBytes<'_> {
+    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {
+        let s = match self {
+            RkyvCowBytes::Borrowed(items) => items,
+            RkyvCowBytes::Owned(s) => s.as_slice(),
+        };
+        Ok(ToSqlOutput::Borrowed(rusqlite::types::ValueRef::Blob(s)))
+    }
+}
+
+impl RkyvCowBytes<'_> {
+    pub fn into_owned(self) -> AlignedVec {
+        match self {
+            RkyvCowBytes::Borrowed(items) => {
+                let mut v = AlignedVec::with_capacity(items.len());
+                v.extend_from_slice(items);
+                v
+            }
+            RkyvCowBytes::Owned(aligned_vec) => aligned_vec,
+        }
+    }
+}
+
+impl AsRef<[u8]> for RkyvCowBytes<'_> {
+    fn as_ref(&self) -> &[u8] {
+        match self {
+            RkyvCowBytes::Borrowed(items) => items,
+            RkyvCowBytes::Owned(aligned_vec) => aligned_vec.as_slice(),
+        }
+    }
+}
+
+/// Data that can be computed starting from the raw JSON documentation for a crate,
+/// without having to re-invoke `rustdoc`.
+#[derive(Debug)]
+pub struct SecondaryIndexes<'a> {
+    pub import_index: Cow<'a, [u8]>,
+    pub annotated_items: Option<Cow<'a, [u8]>>,
+    pub import_path2id: RkyvCowBytes<'a>,
+    pub re_exports: Cow<'a, [u8]>,
+}
+
+/// The serialized form of a crate's documentation, as stored in the cache.
+#[derive(Debug)]
+pub struct CacheEntry<'a> {
+    pub root_item_id: u32,
+    pub external_crates: Cow<'a, [u8]>,
+    pub paths: RkyvCowBytes<'a>,
+    pub format_version: i64,
+    pub items: RkyvCowBytes<'a>,
+    pub secondary_indexes: Option<SecondaryIndexes<'a>>,
+}
+
+/// The key used to store and retrieve a crate's documentation from the cache.
+///
+/// It tries to capture all the information that can influence the output of the
+/// relevant `rustdoc` command.
+#[derive(Debug)]
+pub struct ThirdPartyCrateCacheKey<'a> {
+    pub crate_name: &'a str,
+    pub crate_source: Cow<'a, str>,
+    pub crate_version: String,
+    /// The hash of the crate's source code.
+    /// It is only populated for path dependencies.
+    pub crate_hash: Option<u64>,
+    pub cargo_fingerprint: &'a str,
+    pub rustdoc_options: String,
+    pub default_feature_is_enabled: bool,
+    pub active_named_features: String,
+}
diff --git a/compiler/pavexc_rustdoc_cache/src/types/crate_data.rs b/compiler/pavexc_rustdoc_cache/src/types/crate_data.rs
new file mode 100644
index 000000000..a60d14fd6
--- /dev/null
+++ b/compiler/pavexc_rustdoc_cache/src/types/crate_data.rs
@@ -0,0 +1,23 @@
+//! Core rustdoc data types.
+
+use rustc_hash::FxHashMap;
+use rustdoc_types::ExternalCrate;
+
+use super::{CrateItemIndex, CrateItemPaths};
+
+/// The JSON documentation for a crate.
+#[derive(Debug, Clone)]
+pub struct CrateData {
+    /// The id of the root item for the crate.
+    pub root_item_id: rustdoc_types::Id,
+    /// A mapping from the id of an external crate to the information about it.
+    #[allow(clippy::disallowed_types)]
+    pub external_crates: FxHashMap<u32, ExternalCrate>,
+    /// A mapping from the id of a type to its fully qualified path.
+    /// Primarily useful for foreign items that are being re-exported by this crate.
+    pub paths: CrateItemPaths,
+    /// The version of the JSON format used by rustdoc.
+    pub format_version: u32,
+    /// The index of all the items in the crate.
+    pub index: CrateItemIndex,
+}
diff --git a/compiler/pavexc_rustdoc_cache/src/types/import_index.rs b/compiler/pavexc_rustdoc_cache/src/types/import_index.rs
new file mode 100644
index 000000000..c42ae7233
--- /dev/null
+++ b/compiler/pavexc_rustdoc_cache/src/types/import_index.rs
@@ -0,0 +1,143 @@
+//! Index of importable items in a crate.
+
+use std::cmp::Ordering;
+use std::collections::BTreeSet;
+
+use ahash::HashMap;
+
+/// An index of all importable items in a crate.
+#[derive(
+    Debug, Clone, Default, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode,
+)]
+pub struct ImportIndex {
+    /// A mapping that keeps track of all modules defined in the current crate.
+    ///
+    /// We track modules separately because their names are allowed to collide with
+    /// type and function names.
+    pub modules: HashMap<rustdoc_types::Id, ImportIndexEntry>,
+    /// A mapping that keeps track of traits, structs, enums and functions
+    /// defined in the current crate.
+    pub items: HashMap<rustdoc_types::Id, ImportIndexEntry>,
+    /// A mapping that associates the id of each re-export (`pub use ...`) to the id
+    /// of the module it was re-exported from.
+    pub re_export2parent_module: HashMap<rustdoc_types::Id, rustdoc_types::Id>,
+}
+
+/// An entry in [`ImportIndex`].
+#[derive(
+    Debug, Clone, Default, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode,
+)]
+pub struct ImportIndexEntry {
+    /// All the public paths that can be used to import the item.
+    pub public_paths: BTreeSet<SortablePath>,
+    /// All the private paths that can be used to import the item.
+    pub private_paths: BTreeSet<SortablePath>,
+    /// The path where the item was originally defined.
+    ///
+    /// It may be set to `None` if we can't access the original definition.
+    /// E.g. an item defined in a private module of `std`, where we only have access
+    /// to the public API.
+    pub defined_at: Option<Vec<String>>,
+}
+
+/// The visibility of a path inside [`ImportIndexEntry`].
+pub enum EntryVisibility {
+    /// The item can be imported from outside the crate where it was defined.
+    Public,
+    /// The item can only be imported from within the crate where it was defined.
+    Private,
+}
+
+impl ImportIndexEntry {
+    /// A private constructor.
+    pub fn empty() -> Self {
+        Self {
+            public_paths: BTreeSet::new(),
+            private_paths: BTreeSet::new(),
+            defined_at: None,
+        }
+    }
+
+    /// Create a new entry from a path.
+    pub fn new(path: Vec<String>, visibility: EntryVisibility, is_definition: bool) -> Self {
+        let mut entry = Self::empty();
+        if is_definition {
+            entry.defined_at = Some(path.clone());
+        }
+        match visibility {
+            EntryVisibility::Public => entry.public_paths.insert(SortablePath(path)),
+            EntryVisibility::Private => entry.private_paths.insert(SortablePath(path)),
+        };
+        entry
+    }
+
+    /// Add a new private path for this item.
+    pub fn insert_private(&mut self, path: Vec<String>) {
+        self.private_paths.insert(SortablePath(path));
+    }
+
+    /// Add a new path for this item.
+    pub fn insert(&mut self, path: Vec<String>, visibility: EntryVisibility) {
+        match visibility {
+            EntryVisibility::Public => self.public_paths.insert(SortablePath(path)),
+            EntryVisibility::Private => self.private_paths.insert(SortablePath(path)),
+        };
+    }
+
+    /// Types can be exposed under multiple paths.
+    /// This method returns a "canonical" importable path—i.e. the shortest importable path
+    /// pointing at the type you specified.
+    ///
+    /// If the type is public, this method returns the shortest public path.
+    /// If the type is private, this method returns the shortest private path.
+    pub fn canonical_path(&self) -> &[String] {
+        if let Some(SortablePath(p)) = self.public_paths.first() {
+            return p;
+        }
+        if let Some(SortablePath(p)) = self.private_paths.first() {
+            return p;
+        }
+        unreachable!("There must be at least one path associated to an import index entry")
+    }
+
+    /// Returns all paths associated with the type, both public and private.
+    pub fn paths(&self) -> impl Iterator<Item = &[String]> {
+        self.public_paths
+            .iter()
+            .map(|SortablePath(p)| p.as_slice())
+            .chain(
+                self.private_paths
+                    .iter()
+                    .map(|SortablePath(p)| p.as_slice()),
+            )
+    }
+}
+
+#[derive(
+    Debug,
+    Clone,
+    Eq,
+    PartialEq,
+    serde::Serialize,
+    serde::Deserialize,
+    bincode::Encode,
+    bincode::Decode,
+)]
+#[serde(transparent)]
+pub struct SortablePath(pub Vec<String>);
+
+impl Ord for SortablePath {
+    fn cmp(&self, other: &Self) -> Ordering {
+        match self.0.len().cmp(&other.0.len()) {
+            // Compare lexicographically if lengths are equal
+            Ordering::Equal => self.0.cmp(&other.0),
+            other => other,
+        }
+    }
+}
+
+impl PartialOrd for SortablePath {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
diff --git a/compiler/pavexc_rustdoc_cache/src/types/import_path.rs b/compiler/pavexc_rustdoc_cache/src/types/import_path.rs
new file mode 100644
index 000000000..8e63d53c8
--- /dev/null
+++ b/compiler/pavexc_rustdoc_cache/src/types/import_path.rs
@@ -0,0 +1,63 @@
+//! Mapping from import paths to item IDs.
+
+use ahash::HashMap;
+use rkyv::collections::swiss_table::ArchivedHashMap;
+use rkyv::rancor::Panic;
+use rkyv::string::ArchivedString;
+use rkyv::util::AlignedVec;
+use rkyv::vec::ArchivedVec;
+use rustdoc_types::ArchivedId;
+
+/// A mapping from import paths to the id of the item they point to.
+///
+/// The `Eager` variant contains the entire mapping, fully deserialized.
+///
+/// The `Lazy` variant contains the index as a byte array, with entries deserialized on demand.
+#[derive(Debug, Clone)]
+pub enum ImportPath2Id {
+    Eager(EagerImportPath2Id),
+    Lazy(LazyImportPath2Id),
+}
+
+impl ImportPath2Id {
+    pub fn get(&self, path: &[String]) -> Option<rustdoc_types::Id> {
+        match self {
+            ImportPath2Id::Eager(m) => m.0.get(path).cloned(),
+            ImportPath2Id::Lazy(m) => m.get_deserialized(path),
+        }
+    }
+}
+
+/// See [`ImportPath2Id`] for more information.
+#[derive(Debug, Clone)]
+pub struct EagerImportPath2Id(pub HashMap<Vec<String>, rustdoc_types::Id>);
+
+/// See [`ImportPath2Id`] for more information.
+///
+/// Stores rkyv-serialized bytes of a `HashMap<Vec<String>, Id>` and provides zero-copy access.
+#[derive(Debug, Clone)]
+pub struct LazyImportPath2Id(pub AlignedVec);
+
+impl LazyImportPath2Id {
+    #[inline]
+    fn archived(&self) -> &ArchivedHashMap<ArchivedVec<ArchivedString>, ArchivedId> {
+        unsafe {
+            rkyv::access_unchecked::<ArchivedHashMap<ArchivedVec<ArchivedString>, ArchivedId>>(
+                &self.0,
+            )
+        }
+    }
+
+    pub fn get(&self, path: &[String]) -> Option<&ArchivedId> {
+        let path_vec: Vec<String> = path.to_vec();
+        let bytes = rkyv::to_bytes::<Panic>(&path_vec).ok()?;
+
+        let archived_key = unsafe { rkyv::access_unchecked::<ArchivedVec<ArchivedString>>(&bytes) };
+        self.archived().get(archived_key)
+    }
+
+    pub fn get_deserialized(&self, path: &[String]) -> Option<rustdoc_types::Id> {
+        let archived = self.get(path)?;
+        Some(rkyv::deserialize::<_, Panic>(archived).unwrap())
+    }
+}
diff --git a/compiler/pavexc_rustdoc_cache/src/types/item_index.rs b/compiler/pavexc_rustdoc_cache/src/types/item_index.rs
new file mode 100644
index 000000000..7f5d53b2a
--- /dev/null
+++ b/compiler/pavexc_rustdoc_cache/src/types/item_index.rs
@@ -0,0 +1,77 @@
+//! Index of all items in a crate.
+
+use std::borrow::Cow;
+
+use rkyv::collections::swiss_table::ArchivedHashMap;
+use rkyv::rancor::Panic;
+use rkyv::util::AlignedVec;
+use rustc_hash::FxHashMap;
+use rustdoc_types::{ArchivedId, ArchivedItem, Item};
+
+/// The index of all the items in the crate.
+///
+/// Since the index can be quite large, we try to avoid deserializing it all at once.
+///
+/// The `Eager` variant contains the entire index, fully deserialized. This is what we get
+/// when we have had to compute the documentation for the crate on the fly.
+///
+/// The `Lazy` variant contains the index as a byte array. There is a mapping from the
+/// id of an item to the start and end index of the item's bytes in the byte array.
+/// We can therefore deserialize the item only if we need to access it.
+/// Since we only access a tiny portion of the items in the index (especially for large crates),
+/// this translates in a significant performance improvement.
+#[derive(Debug, Clone)]
+pub enum CrateItemIndex {
+    Eager(EagerCrateItemIndex),
+    Lazy(LazyCrateItemIndex),
+}
+
+impl CrateItemIndex {
+    /// Retrieve an item from the index given its id.
+    pub fn get(&self, id: &rustdoc_types::Id) -> Option<Cow<'_, Item>> {
+        match self {
+            Self::Eager(index) => index.index.get(id).map(Cow::Borrowed),
+            Self::Lazy(index) => {
+                let item = index.get_deserialized(id)?;
+                Some(Cow::Owned(item))
+            }
+        }
+    }
+}
+
+/// See [`CrateItemIndex`] for more information.
+#[derive(Debug, Clone)]
+pub struct EagerCrateItemIndex {
+    #[allow(clippy::disallowed_types)]
+    pub index: FxHashMap<rustdoc_types::Id, Item>,
+}
+
+/// See [`CrateItemIndex`] for more information.
+///
+/// Stores rkyv-serialized bytes of a `HashMap<Id, Item>` and provides zero-copy access.
+#[derive(Debug, Clone)]
+pub struct LazyCrateItemIndex {
+    /// The rkyv-serialized bytes containing a `HashMap<Id, Item>`.
+    pub bytes: AlignedVec,
+}
+
+impl LazyCrateItemIndex {
+    /// Get zero-copy access to the archived HashMap.
+    #[inline]
+    fn archived(&self) -> &ArchivedHashMap<ArchivedId, ArchivedItem> {
+        // SAFETY: The bytes were serialized by rkyv from a valid HashMap.
+        // We trust the cache to contain valid data.
+        unsafe { rkyv::access_unchecked::<ArchivedHashMap<ArchivedId, ArchivedItem>>(&self.bytes) }
+    }
+
+    /// Get an item by its ID, returning a reference to the archived item.
+    pub fn get(&self, id: &rustdoc_types::Id) -> Option<&ArchivedItem> {
+        self.archived().get(&ArchivedId(id.0.into()))
+    }
+
+    /// Deserialize an item by its ID.
+    pub fn get_deserialized(&self, id: &rustdoc_types::Id) -> Option<Item> {
+        let archived = self.get(id)?;
+        Some(rkyv::deserialize::<Item, Panic>(archived).unwrap())
+    }
+}
diff --git a/compiler/pavexc_rustdoc_cache/src/types/item_paths.rs b/compiler/pavexc_rustdoc_cache/src/types/item_paths.rs
new file mode 100644
index 000000000..a7621acc7
--- /dev/null
+++ b/compiler/pavexc_rustdoc_cache/src/types/item_paths.rs
@@ -0,0 +1,128 @@
+//! Mapping from item IDs to their paths.
+
+use std::borrow::Cow;
+
+use rkyv::collections::swiss_table::ArchivedHashMap;
+use rkyv::hash::FxHasher64;
+use rkyv::rancor::Panic;
+use rkyv::util::AlignedVec;
+use rustc_hash::FxHashMap;
+use rustdoc_types::{ArchivedId, ArchivedItemSummary, ItemKind, ItemSummary};
+
+/// A mapping from the id of a type to its fully qualified path.
+///
+/// Primarily useful for foreign items that are being re-exported by this crate.
+#[derive(Debug, Clone)]
+pub enum CrateItemPaths {
+    Eager(EagerCrateItemPaths),
+    Lazy(LazyCrateItemPaths),
+}
+
+impl CrateItemPaths {
+    /// Retrieve an item summary from the index given its id.
+    pub fn get(&self, id: &rustdoc_types::Id) -> Option<Cow<'_, ItemSummary>> {
+        match self {
+            Self::Eager(m) => m.paths.get(id).map(Cow::Borrowed),
+            Self::Lazy(m) => {
+                let item = m.get_deserialized(id)?;
+                Some(Cow::Owned(item))
+            }
+        }
+    }
+
+    pub fn iter(&self) -> impl Iterator<Item = (rustdoc_types::Id, ItemSummaryRef<'_>)> {
+        match self {
+            CrateItemPaths::Eager(paths) => CrateItemPathsIter::Eager(paths.paths.iter()),
+            CrateItemPaths::Lazy(paths) => CrateItemPathsIter::Lazy(paths.archived().iter()),
+        }
+    }
+}
+
+pub enum CrateItemPathsIter<'a> {
+    Eager(std::collections::hash_map::Iter<'a, rustdoc_types::Id, ItemSummary>),
+    Lazy(
+        rkyv::collections::swiss_table::map::Iter<'a, ArchivedId, ArchivedItemSummary, FxHasher64>,
+    ),
+}
+
+pub enum ItemSummaryRef<'a> {
+    Eager(&'a ItemSummary),
+    Lazy(&'a ArchivedItemSummary),
+}
+
+impl<'a> ItemSummaryRef<'a> {
+    pub fn crate_id(&self) -> u32 {
+        match self {
+            ItemSummaryRef::Eager(s) => s.crate_id,
+            ItemSummaryRef::Lazy(s) => s.crate_id.to_native(),
+        }
+    }
+
+    pub fn kind(&self) -> ItemKind {
+        match self {
+            ItemSummaryRef::Eager(s) => s.kind,
+            ItemSummaryRef::Lazy(s) => {
+                // Safe to do since the enum is repr(u8)
+                rkyv::deserialize::<_, rkyv::rancor::Infallible>(&s.kind).unwrap()
+            }
+        }
+    }
+
+    pub fn path(&self) -> Cow<'_, [String]> {
+        match self {
+            ItemSummaryRef::Eager(s) => Cow::Borrowed(&s.path),
+            ItemSummaryRef::Lazy(s) => {
+                Cow::Owned(s.path.iter().map(|s| s.as_str().to_owned()).collect())
+            }
+        }
+    }
+}
+
+impl<'a> Iterator for CrateItemPathsIter<'a> {
+    type Item = (rustdoc_types::Id, ItemSummaryRef<'a>);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self {
+            Self::Eager(iter) => iter.next().map(|(k, v)| (*k, ItemSummaryRef::Eager(v))),
+            Self::Lazy(iter) => iter
+                .next()
+                .map(|(k, v)| (rustdoc_types::Id(k.0.to_native()), ItemSummaryRef::Lazy(v))),
+        }
+    }
+}
+
+/// See [`CrateItemPaths`] for more information.
+#[derive(Debug, Clone)]
+pub struct EagerCrateItemPaths {
+    #[allow(clippy::disallowed_types)]
+    pub paths: FxHashMap<rustdoc_types::Id, ItemSummary>,
+}
+
+/// See [`CrateItemPaths`] for more information.
+#[derive(Debug, Clone)]
+pub struct LazyCrateItemPaths {
+    pub bytes: AlignedVec,
+}
+
+impl LazyCrateItemPaths {
+    /// Get zero-copy access to the archived HashMap.
+    #[inline]
+    fn archived(&self) -> &ArchivedHashMap<ArchivedId, ArchivedItemSummary, FxHasher64> {
+        // SAFETY: The bytes were serialized by rkyv from a valid HashMap.
+        // We trust the cache to contain valid data.
+        unsafe {
+            rkyv::access_unchecked::<ArchivedHashMap<ArchivedId, ArchivedItemSummary, FxHasher64>>(
+                &self.bytes,
+            )
+        }
+    }
+
+    /// Get an item by its ID, returning a reference to the archived summary.
+    pub fn get(&self, id: &rustdoc_types::Id) -> Option<&ArchivedItemSummary> {
+        self.archived().get(&ArchivedId(id.0.into()))
+    }
+
+    /// Deserialize a summary by its ID.
+    pub fn get_deserialized(&self, id: &rustdoc_types::Id) -> Option<ItemSummary> {
+        let archived = self.get(id)?;
+        Some(rkyv::deserialize::<ItemSummary, Panic>(archived).unwrap())
+    }
+}
diff --git a/compiler/pavexc_rustdoc_cache/src/types/mod.rs b/compiler/pavexc_rustdoc_cache/src/types/mod.rs
new file mode 100644
index 000000000..4a9098731
--- /dev/null
+++ b/compiler/pavexc_rustdoc_cache/src/types/mod.rs
@@ -0,0 +1,19 @@
+//! Types for storing and retrieving rustdoc JSON documentation in the cache.
+
+mod cache_entry;
+mod crate_data;
+mod import_index;
+mod import_path;
+mod item_index;
+mod item_paths;
+mod re_exports;
+
+pub use cache_entry::{CacheEntry, RkyvCowBytes, SecondaryIndexes, ThirdPartyCrateCacheKey};
+pub use crate_data::CrateData;
+pub use import_index::{EntryVisibility, ImportIndex, ImportIndexEntry, SortablePath};
+pub use import_path::{EagerImportPath2Id, ImportPath2Id, LazyImportPath2Id};
+pub use item_index::{CrateItemIndex, EagerCrateItemIndex, LazyCrateItemIndex};
+pub use item_paths::{
+    CrateItemPaths, CrateItemPathsIter, EagerCrateItemPaths, ItemSummaryRef, LazyCrateItemPaths,
+};
+pub use re_exports::{ExternalReExport, ExternalReExports};
diff --git a/compiler/pavexc_rustdoc_cache/src/types/re_exports.rs b/compiler/pavexc_rustdoc_cache/src/types/re_exports.rs
new file mode 100644
index 000000000..dfa1e0051
--- /dev/null
+++ b/compiler/pavexc_rustdoc_cache/src/types/re_exports.rs
@@ -0,0 +1,109 @@
+//! Tracking of external re-exports.
+
+use ahash::HashMap;
+
+use super::CrateData;
+
+/// Track re-exports of types (or entire modules!) from other crates.
+#[derive(
+    Debug, Clone, Default, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode,
+)]
+pub struct ExternalReExports {
+    /// Key: the path of the re-exported type in the current crate.
+    /// Value: the id of the `rustdoc` item of kind `use` that performed the re-export.
+    ///
+    /// E.g. `pub use hyper::server as sx;` in `lib.rs` would use `vec!["my_crate", "sx"]`
+    /// as key in this map.
+    pub(crate) target_path2use_id: HashMap<Vec<String>, rustdoc_types::Id>,
+    /// Key: the id of the `rustdoc` item of kind `use` that performed the re-export.
+    /// Value: metadata about the re-export.
+    pub(crate) use_id2re_export: HashMap<rustdoc_types::Id, ExternalReExport>,
+}
+
+impl ExternalReExports {
+    /// Iterate over the external re-exports that have been collected.
+    pub fn iter(
+        &self,
+    ) -> impl Iterator<Item = (&Vec<String>, rustdoc_types::Id, &ExternalReExport)> {
+        self.target_path2use_id
+            .iter()
+            .map(|(target_path, id)| (target_path, *id, &self.use_id2re_export[id]))
+    }
+
+    /// Get metadata about a re-export given the use item id.
+    pub fn get(&self, use_id: &rustdoc_types::Id) -> Option<&ExternalReExport> {
+        self.use_id2re_export.get(use_id)
+    }
+
+    /// Get the use item id for a given target path.
+    pub fn get_use_id(&self, target_path: &[String]) -> Option<rustdoc_types::Id> {
+        self.target_path2use_id.get(target_path).copied()
+    }
+
+    /// Insert a re-export entry.
+    pub fn insert_entry(
+        &mut self,
+        target_path: Vec<String>,
+        use_id: rustdoc_types::Id,
+        re_export: ExternalReExport,
+    ) {
+        self.target_path2use_id.insert(target_path, use_id);
+        self.use_id2re_export.insert(use_id, re_export);
+    }
+
+    /// Add another re-export to the database.
+    pub fn insert(
+        &mut self,
+        krate: &CrateData,
+        use_item: &rustdoc_types::Item,
+        current_path: &[String],
+    ) {
+        let rustdoc_types::ItemEnum::Use(use_) = &use_item.inner else {
+            unreachable!()
+        };
+        let imported_id = use_.id.expect("Import doesn't have an associated id");
+        let Some(imported_summary) = krate.paths.get(&imported_id) else {
+            // TODO: this is firing for std's JSON docs. File a bug report.
+            // panic!("The imported id ({}) is not listed in the index nor in the path section of rustdoc's JSON output", imported_id.0)
+            return;
+        };
+        debug_assert!(imported_summary.crate_id != 0);
+        // We are looking at a public re-export of another crate
+        // (e.g. `pub use hyper;`), one of its modules or one of its items.
+        // Due to how re-exports are handled in `rustdoc`, the re-exported
+        // items inside that foreign module will not be found in the `index`
+        // for this crate.
+        // We intentionally add foreign items to the index to get a "complete"
+        // picture of all the types available in this crate.
+        let external_crate_id = imported_summary.crate_id;
+        let source_path = imported_summary.path.to_owned();
+        let re_exported_path = {
+            let mut p = current_path.to_owned();
+            if !use_.is_glob {
+                p.push(use_.name.clone());
+            }
+            p
+        };
+        let re_export = ExternalReExport {
+            source_path,
+            external_crate_id,
+        };
+
+        self.target_path2use_id
+            .insert(re_exported_path, use_item.id);
+        self.use_id2re_export.insert(use_item.id, re_export);
+    }
+}
+
+/// Information about a type (or module) re-exported from another crate.
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode)]
+pub struct ExternalReExport {
+    /// The path of the re-exported type in the crate it was re-exported from.
+    ///
+    /// E.g. `pub use hyper::server as sx;` in `lib.rs` would set `source_path` to
+    /// `vec!["hyper", "server"]`.
+    pub source_path: Vec<String>,
+    /// The id of the source crate in the `external_crates` section of the JSON
+    /// documentation of the crate that re-exported it.
+ pub external_crate_id: u32, +} diff --git a/px_workspace_hack/Cargo.toml b/px_workspace_hack/Cargo.toml index fcab4d459..963f6bb8f 100644 --- a/px_workspace_hack/Cargo.toml +++ b/px_workspace_hack/Cargo.toml @@ -21,6 +21,7 @@ ahash = { version = "0.8" } aho-corasick = { version = "1" } base64 = { version = "0.22" } bincode = { version = "2", features = ["serde"] } +bitflags = { version = "2", default-features = false, features = ["serde"] } byteorder = { version = "1" } clap = { version = "4", features = ["derive", "env"] } clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "suggestions", "usage"] } @@ -101,6 +102,7 @@ ahash = { version = "0.8" } aho-corasick = { version = "1" } base64 = { version = "0.22" } bincode = { version = "2", features = ["serde"] } +bitflags = { version = "2", default-features = false, features = ["serde"] } byteorder = { version = "1" } clap = { version = "4", features = ["derive", "env"] } clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "suggestions", "usage"] }