From cda9eabafc5a0ab37730600d23d18fa8434dcf49 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov
Date: Sun, 4 Jan 2026 23:40:42 +0200
Subject: [PATCH] Do not eagerly load entire file contents into memory when decoding it (#45971)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When working on https://github.com/zed-industries/zed/pull/45969 I noticed that whenever I try to open a binary file

```
~/Desktop/svenska ❯ du -ha "Svenska А2B1. 07.09.2025 [u6qEIe9-COc].mkv"
456M    Svenska А2B1. 07.09.2025 [u6qEIe9-COc].mkv
```

Zed allocates the file's entire size in memory

[memory allocation screenshots]

only to show me this:

[screenshot]

Given that our existing code only checks the first 1024 bytes to decide whether to bail on a binary file, loading the whole file up front seems very wasteful. I adjusted the code to read that "header" first and run the checks on it, and to continue reading the entire file only after the checks succeed.

I suspect this should also help project search, especially the crashes and high memory usage that occur when many binary files are present.

Release Notes:

- Improved Zed's memory usage when attempting to open binary files
---
 crates/worktree/src/worktree.rs | 76 +++++++++++++++++++++++++++++----
 1 file changed, 68 insertions(+), 8 deletions(-)

diff --git a/crates/worktree/src/worktree.rs b/crates/worktree/src/worktree.rs
index 7d682c0321569fa0255b740fb537bf8ac2076b87..9c2a448f3aad351470399fd11107028966b6543c 100644
--- a/crates/worktree/src/worktree.rs
+++ b/crates/worktree/src/worktree.rs
@@ -1359,9 +1359,7 @@ impl LocalWorktree {
                     anyhow::bail!("File is too large to load");
                 }
             }
-
-            let content = fs.load_bytes(&abs_path).await?;
-            let (text, encoding, has_bom) = decode_byte(content)?;
+            let (text, encoding, has_bom) = decode_file_text(fs.as_ref(), &abs_path).await?;

             let worktree = this.upgrade().context("worktree was dropped")?;
             let file = match entry.await? {
@@ -5872,14 +5870,76 @@ impl fs::Watcher for NullWatcher {
     }
 }

-fn decode_byte(bytes: Vec<u8>) -> anyhow::Result<(String, &'static Encoding, bool)> {
-    // check BOM
-    if let Some((encoding, _bom_len)) = Encoding::for_bom(&bytes) {
+const FILE_ANALYSIS_BYTES: usize = 1024;
+
+async fn decode_file_text(
+    fs: &dyn Fs,
+    abs_path: &Path,
+) -> Result<(String, &'static Encoding, bool)> {
+    let mut file = fs
+        .open_sync(&abs_path)
+        .await
+        .with_context(|| format!("opening file {abs_path:?}"))?;
+
+    // First, read the beginning of the file to determine its kind and encoding.
+    // We do not want to load an entire large blob into memory only to discard it.
+    let mut file_first_bytes = Vec::with_capacity(FILE_ANALYSIS_BYTES);
+    let mut buf = [0u8; FILE_ANALYSIS_BYTES];
+    let mut reached_eof = false;
+    loop {
+        if file_first_bytes.len() >= FILE_ANALYSIS_BYTES {
+            break;
+        }
+        let n = file
+            .read(&mut buf)
+            .with_context(|| format!("reading bytes of the file {abs_path:?}"))?;
+        if n == 0 {
+            reached_eof = true;
+            break;
+        }
+        file_first_bytes.extend_from_slice(&buf[..n]);
+    }
+    let (bom_encoding, byte_content) = decode_byte_header(&file_first_bytes);
+    anyhow::ensure!(
+        byte_content != ByteContent::Binary,
+        "Binary files are not supported"
+    );
+
+    // If the file is eligible for opening, read the rest of the file.
+    let mut content = file_first_bytes;
+    if !reached_eof {
+        let mut buf = [0u8; 8 * 1024];
+        loop {
+            let n = file
+                .read(&mut buf)
+                .with_context(|| format!("reading remaining bytes of the file {abs_path:?}"))?;
+            if n == 0 {
+                break;
+            }
+            content.extend_from_slice(&buf[..n]);
+        }
+    }
+    decode_byte_full(content, bom_encoding, byte_content)
+}
+
+fn decode_byte_header(prefix: &[u8]) -> (Option<&'static Encoding>, ByteContent) {
+    if let Some((encoding, _bom_len)) = Encoding::for_bom(prefix) {
+        return (Some(encoding), ByteContent::Unknown);
+    }
+    (None, analyze_byte_content(prefix))
+}
+
+fn decode_byte_full(
+    bytes: Vec<u8>,
+    bom_encoding: Option<&'static Encoding>,
+    byte_content: ByteContent,
+) -> Result<(String, &'static Encoding, bool)> {
+    if let Some(encoding) = bom_encoding {
         let (cow, _) = encoding.decode_with_bom_removal(&bytes);
         return Ok((cow.into_owned(), encoding, true));
     }

-    match analyze_byte_content(&bytes) {
+    match byte_content {
         ByteContent::Utf16Le => {
             let encoding = encoding_rs::UTF_16LE;
             let (cow, _, _) = encoding.decode(&bytes);
@@ -5942,7 +6002,7 @@ fn analyze_byte_content(bytes: &[u8]) -> ByteContent {
         return ByteContent::Unknown;
     }

-    let check_len = bytes.len().min(1024);
+    let check_len = bytes.len().min(FILE_ANALYSIS_BYTES);
     let sample = &bytes[..check_len];

     if !sample.contains(&0) {
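
For reviewers, here is a minimal standalone sketch of the header-first pattern this patch applies, using only the standard library. The function name `read_text_file_lazily` and the bare NUL-byte heuristic are illustrative stand-ins, not the actual `Encoding`/`ByteContent` analysis from `worktree.rs`:

```rust
use std::fs::File;
use std::io::{Error, ErrorKind, Read};
use std::path::Path;

const FILE_ANALYSIS_BYTES: usize = 1024;

// Hypothetical helper: read only a small prefix first, reject files that look
// binary, and load the remainder only after the check passes.
fn read_text_file_lazily(path: &Path) -> std::io::Result<Vec<u8>> {
    let mut file = File::open(path)?;

    // Fill up to FILE_ANALYSIS_BYTES from the start of the file.
    let mut prefix = vec![0u8; FILE_ANALYSIS_BYTES];
    let mut filled = 0;
    while filled < FILE_ANALYSIS_BYTES {
        let n = file.read(&mut prefix[filled..])?;
        if n == 0 {
            break; // EOF before the prefix was full
        }
        filled += n;
    }
    prefix.truncate(filled);

    // Simplified binary heuristic: a NUL byte in the sample means "not text".
    if prefix.contains(&0) {
        return Err(Error::new(
            ErrorKind::InvalidData,
            "binary files are not supported",
        ));
    }

    // Only now pay for loading the rest of the file into memory.
    let mut content = prefix;
    file.read_to_end(&mut content)?;
    Ok(content)
}

fn main() -> std::io::Result<()> {
    let bytes = read_text_file_lazily(Path::new("Cargo.toml"))?;
    println!("loaded {} bytes", bytes.len());
    Ok(())
}
```

The expensive `read_to_end` only runs after the cheap prefix check, which is what keeps a large binary file from being pulled fully into memory just to be rejected.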