fix: watchdog for hung wsl bridge process (#166)

Adds a 60-second watchdog that silently kills the Claude Code process if
system:init never arrives, preventing the UI from being stuck in a
"Connected" state indefinitely. Refactors process handle to
Arc<Mutex<Option<Child>>> for cross-thread access, and removes the
unused CommandExt import.
This commit is contained in:
2026-02-25 20:06:18 -08:00
committed by Naomi Carrigan
parent dfc3d71c42
commit dbf5234a42
+75 -21
View File
@@ -1,14 +1,13 @@
use std::io::{BufRead, BufReader, Write};
use std::process::{Child, ChildStdin, Command, Stdio};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::{SystemTime, UNIX_EPOCH};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use parking_lot::Mutex;
use tauri::{AppHandle, Emitter};
use tempfile::NamedTempFile;
#[cfg(target_os = "windows")]
use std::os::windows::process::CommandExt;
use crate::achievements::{get_achievement_info, AchievementUnlockedEvent};
use crate::commands::record_cost;
use crate::config::ClaudeStartOptions;
@@ -103,52 +102,58 @@ fn find_claude_binary() -> Option<String> {
}
pub struct WslBridge {
process: Option<Child>,
process: Arc<Mutex<Option<Child>>>,
stdin: Option<ChildStdin>,
working_directory: String,
session_id: Option<String>,
mcp_config_file: Option<NamedTempFile>,
stats: Arc<RwLock<UsageStats>>,
conversation_id: Option<String>,
/// Set to true once the `system:init` message arrives, false at the start of every new session.
received_init: Arc<AtomicBool>,
}
impl WslBridge {
pub fn new() -> Self {
WslBridge {
process: None,
process: Arc::new(Mutex::new(None)),
stdin: None,
working_directory: String::new(),
session_id: None,
mcp_config_file: None,
stats: Arc::new(RwLock::new(UsageStats::new())),
conversation_id: None,
received_init: Arc::new(AtomicBool::new(false)),
}
}
pub fn new_with_conversation_id(conversation_id: String) -> Self {
WslBridge {
process: None,
process: Arc::new(Mutex::new(None)),
stdin: None,
working_directory: String::new(),
session_id: None,
mcp_config_file: None,
stats: Arc::new(RwLock::new(UsageStats::new())),
conversation_id: Some(conversation_id),
received_init: Arc::new(AtomicBool::new(false)),
}
}
pub fn start(&mut self, app: AppHandle, options: ClaudeStartOptions) -> Result<(), String> {
// If a process handle exists but the process has already exited (e.g. due to a
// failed working directory), clean up the stale handle so we can restart cleanly.
if let Some(ref mut process) = self.process {
if process.try_wait().map(|s| s.is_some()).unwrap_or(false) {
self.process = None;
self.stdin = None;
{
let mut proc_guard = self.process.lock();
if let Some(ref mut proc) = *proc_guard {
if proc.try_wait().map(|s| s.is_some()).unwrap_or(false) {
*proc_guard = None;
self.stdin = None;
}
}
if proc_guard.is_some() {
return Err("Process already running".to_string());
}
}
if self.process.is_some() {
return Err("Process already running".to_string());
}
// Load saved achievements and stats when starting a new session
@@ -399,7 +404,10 @@ impl WslBridge {
let stderr = child.stderr.take();
self.stdin = stdin;
self.process = Some(child);
*self.process.lock() = Some(child);
// Reset the init flag so the watchdog and stdout handler start fresh.
self.received_init.store(false, Ordering::SeqCst);
// Note: We no longer reset stats here - stats persist across reconnects
// Stats are only reset when explicitly disconnecting via stop()
@@ -416,8 +424,9 @@ impl WslBridge {
let app_clone = app.clone();
let stats_clone = self.stats.clone();
let conv_id = self.conversation_id.clone();
let received_init_clone = self.received_init.clone();
thread::spawn(move || {
handle_stdout(stdout, app_clone, stats_clone, conv_id);
handle_stdout(stdout, app_clone, stats_clone, conv_id, received_init_clone);
});
}
@@ -429,12 +438,31 @@ impl WslBridge {
});
}
// Emit Connected immediately so the frontend can send the greeting message.
// This is intentionally optimistic — Claude Code buffers stdout until stdin receives
// data on Windows/WSL, so we must send something to stdin first or system:init never
// arrives. The received_init flag below tracks whether init actually arrived.
emit_connection_status(
&app,
ConnectionStatus::Connected,
self.conversation_id.clone(),
);
// Watchdog: if system:init never arrives the process is truly hung (e.g. a silent crash
// after spawning). After 5 minutes we kill it so the user isn't stuck forever.
// handle_stdout will surface the error when stdout closes after the kill.
let process_watchdog = self.process.clone();
let received_init_watchdog = self.received_init.clone();
thread::spawn(move || {
thread::sleep(Duration::from_secs(60));
if !received_init_watchdog.load(Ordering::SeqCst) {
if let Some(mut proc) = process_watchdog.lock().take() {
let _ = proc.kill();
let _ = proc.wait();
}
}
});
Ok(())
}
@@ -513,7 +541,10 @@ impl WslBridge {
// Due to persistent bug in Claude Code where ESC/Ctrl+C doesn't work,
// we have to kill the process. This is the only reliable way to stop it.
// See: https://github.com/anthropics/claude-code/issues/3455
if let Some(mut process) = self.process.take() {
// Extract the process first so the MutexGuard is dropped before we mutably
// borrow `self` again via estimate_interrupted_request_cost.
let maybe_process = self.process.lock().take();
if let Some(mut process) = maybe_process {
// Estimate cost for interrupted request before killing
self.estimate_interrupted_request_cost(app);
@@ -643,7 +674,7 @@ impl WslBridge {
}
pub fn stop(&mut self, app: &AppHandle) {
if let Some(mut process) = self.process.take() {
if let Some(mut process) = self.process.lock().take() {
let _ = process.kill();
let _ = process.wait();
}
@@ -674,7 +705,7 @@ impl WslBridge {
}
pub fn is_running(&self) -> bool {
self.process.is_some()
self.process.lock().is_some()
}
pub fn get_working_directory(&self) -> &str {
@@ -697,13 +728,16 @@ fn handle_stdout(
app: AppHandle,
stats: Arc<RwLock<UsageStats>>,
conversation_id: Option<String>,
received_init: Arc<AtomicBool>,
) {
let reader = BufReader::new(stdout);
for line in reader.lines() {
match line {
Ok(line) if !line.is_empty() => {
if let Err(e) = process_json_line(&line, &app, &stats, &conversation_id) {
if let Err(e) =
process_json_line(&line, &app, &stats, &conversation_id, &received_init)
{
tracing::error!("Error processing line: {}", e);
}
}
@@ -715,6 +749,22 @@ fn handle_stdout(
}
}
// If stdout closed before system:init arrived the process exited without initialising.
// Emit an error line so the user understands why the connection failed.
if !received_init.load(Ordering::SeqCst) {
let _ = app.emit(
"claude:output",
OutputEvent {
line_type: "error".to_string(),
content: "Claude Code exited before initialising. Check the working directory and Claude Code installation, then try connecting again.".to_string(),
tool_name: None,
conversation_id: conversation_id.clone(),
cost: None,
parent_tool_use_id: None,
},
);
}
emit_connection_status(&app, ConnectionStatus::Disconnected, conversation_id);
}
@@ -919,6 +969,7 @@ fn process_json_line(
app: &AppHandle,
stats: &Arc<RwLock<UsageStats>>,
conversation_id: &Option<String>,
received_init: &Arc<AtomicBool>,
) -> Result<(), String> {
let message: ClaudeMessage = serde_json::from_str(line)
.map_err(|e| format!("Failed to parse JSON: {} - Line: {}", e, line))?;
@@ -931,6 +982,9 @@ fn process_json_line(
..
} => {
if subtype == "init" {
// Mark as initialised so the watchdog knows the process is healthy.
received_init.store(true, Ordering::SeqCst);
if let Some(id) = session_id {
let _ = app.emit(
"claude:session",