crop mode and nav mode

2026-03-17 08:22:00 +01:00
parent 31598729b8
commit 91e3d36f52
6 changed files with 366 additions and 79 deletions
@@ -12,6 +12,9 @@ default = []
 # Build in nav-only mode: no H.264 video decode, only turn-by-turn text.
 # Saves ~300KB+ PSRAM and significant CPU. No esp_h264 component needed.
 nav-only = ["dep:miniz_oxide"]
+# Crop video: take the center 480×320 of the 800×480 frame instead of downscaling it.
+# Skips the per-pixel scaling lookup — the 1:1 pixel copy is ~40% faster.
+crop-video = []

 [profile.release]
 opt-level = 3
@@ -9,6 +9,7 @@ BINARY_NAME="esp32-android-auto-nav"
 # Parse arguments
 BUILD_ONLY=false
 NAV_ONLY=false
+CROP_VIDEO=false
 CARGO_FEATURES=""

 for arg in "$@"; do
@@ -21,22 +22,36 @@ for arg in "$@"; do
      NAV_ONLY=true
      shift
      ;;
+    -c|--crop)
+      CROP_VIDEO=true
+      shift
+      ;;
    -h|--help)
      echo "Usage: ./build.sh [OPTIONS]"
      echo "Options:"
      echo "  -b, --build-only, --no-flash  Build only, skip flashing prompt"
      echo "  -n, --nav-only                Nav-only mode: text turn-by-turn, no video"
+      echo "  -c, --crop                    Crop mode: center-crop 480×320 from 800×480 (faster)"
      echo "  -h, --help                    Show this help message"
      exit 0
      ;;
  esac
 done

+# Build feature list (nav-only takes precedence: --crop is ignored when --nav-only is also given)
+FEATURE_LIST=""
 if [ "$NAV_ONLY" = true ]; then
-    CARGO_FEATURES="--features nav-only"
+    FEATURE_LIST="nav-only"
    echo "📍 Mode: NAV-ONLY (turn-by-turn text, no H.264 video)"
+elif [ "$CROP_VIDEO" = true ]; then
+    FEATURE_LIST="crop-video"
+    echo "🎬 Mode: CROP VIDEO (center 480×320 from 800×480, no scaling)"
 else
-    echo "🎬 Mode: FULL VIDEO (H.264 decode + display)"
+    echo "🎬 Mode: FULL VIDEO (H.264 decode + downscale + display)"
+fi
+
+if [ -n "$FEATURE_LIST" ]; then
+    CARGO_FEATURES="--features $FEATURE_LIST"
 fi

 echo "🔨 Building $BINARY_NAME (release)..."
@@ -16,15 +16,9 @@ CONFIG_SPIRAM=y
 CONFIG_SPIRAM_MODE_QUAD=y
 CONFIG_SPIRAM_SPEED_80M=y
 CONFIG_SPIRAM_USE_MALLOC=y
-# Allocations <= 4KB go to internal DRAM, larger ones to PSRAM.
-# The new strip-by-strip pipeline eliminates the 300KB PSRAM VideoFrame —
-# only the esp_h264 decoder's internal I420 buffers (~576KB) live in PSRAM.
 CONFIG_SPIRAM_MALLOC_ALWAYSINTERNAL=4096
-# Reserve internal memory for DMA buffers (76.8KB) + ESP-IDF critical allocations
 CONFIG_SPIRAM_MALLOC_RESERVE_INTERNAL=32768
-# Allow thread stacks in PSRAM (decode+display thread)
 CONFIG_SPIRAM_ALLOW_STACK_EXTERNAL_MEMORY=y
-# Skip PSRAM memtest on boot (saves ~500ms startup)
 CONFIG_SPIRAM_MEMTEST=n

 # Data Cache — maximize for PSRAM performance (H.264 decode reads PSRAM constantly)
@@ -42,10 +36,8 @@ CONFIG_ESPTOOLPY_FLASHFREQ_80M=y
 # LCD I80 bus — use PLL clock source for stable 40MHz pixel clock
 CONFIG_LCD_PERIPH_CLK_SRC_PLL160M=y

-# Bluetooth — BLE only (ESP32-S3 does NOT support Bluetooth Classic)
-CONFIG_BT_ENABLED=y
-CONFIG_BT_BLE_ENABLED=y
-CONFIG_BT_NIMBLE_ENABLED=y
+# Bluetooth — DISABLED to save ~20KB DRAM
+CONFIG_BT_ENABLED=n

 # WiFi — minimize internal SRAM usage (leave room for DMA buffers)
 CONFIG_ESP_WIFI_ENABLED=y
@@ -58,34 +50,28 @@ CONFIG_ESP_WIFI_RX_BA_WIN=4
 CONFIG_SPIRAM_TRY_ALLOCATE_WIFI_LWIP=y

 # H.264 software decoder (esp_h264 component)
-# Dual-task decoder for better FPS on ESP32-S3
-CONFIG_ESP_H264_DECODER_IRAM=1
-CONFIG_ESP_H264_DUAL_TASK=1
+CONFIG_ESP_H264_DECODER_IRAM=y
+CONFIG_ESP_H264_DUAL_TASK=y
+CONFIG_ESP_H264_DUAL_TASK_CORE=1
+CONFIG_ESP_H264_DUAL_TASK_PRIORITY=19

 # TLS — mbedtls for Android Auto TLS handshake
 CONFIG_MBEDTLS_TLS_CLIENT=y
 CONFIG_MBEDTLS_TLS_SERVER=y
 CONFIG_MBEDTLS_SSL_ALPN=y
-# Disable cert bundle — we only use our own AA cert, and server verify is NONE.
-# The full bundle wastes ~60KB of heap when parsed.
 CONFIG_MBEDTLS_CERTIFICATE_BUNDLE=n
 CONFIG_MBEDTLS_HARDWARE_AES=y
 CONFIG_MBEDTLS_HARDWARE_SHA=y
 CONFIG_MBEDTLS_KEY_EXCHANGE_RSA=y
-# Use default allocator (not internal-only) — RSA MPI needs >32KB of temp buffers
-# and 64KB dcache + 32KB icache + 77KB DMA staging exhaust internal DRAM.
-# PSRAM is fine for one-time handshake; AES-GCM encrypt/decrypt uses HW accel.
 CONFIG_MBEDTLS_DEFAULT_MEM_ALLOC=y

-# TCP/IP — larger window for video streaming throughput
-# 32KB window + 32KB send buffer reduces TCP stalls when phone sends
-# bursty H.264 data.  LWIP buffers go to PSRAM (SPIRAM_TRY_ALLOCATE).
+# TCP/IP — 16KB windows for video streaming
 CONFIG_LWIP_MAX_SOCKETS=10
-CONFIG_LWIP_TCP_SND_BUF_DEFAULT=32768
-CONFIG_LWIP_TCP_WND_DEFAULT=32768
-CONFIG_LWIP_TCP_RECVMBOX_SIZE=32
+CONFIG_LWIP_TCP_SND_BUF_DEFAULT=16384
+CONFIG_LWIP_TCP_WND_DEFAULT=16384
+CONFIG_LWIP_TCP_RECVMBOX_SIZE=16

-# Logging — disable dynamic level checks (~10× faster log calls)
+# Logging — disable dynamic level checks
 CONFIG_LOG_DEFAULT_LEVEL_INFO=y
 CONFIG_LOG_DYNAMIC_LEVEL_CONTROL=n
 CONFIG_LOG_TAG_LEVEL_IMPL_NONE=y
@@ -93,10 +79,6 @@ CONFIG_LOG_TAG_LEVEL_IMPL_NONE=y
 # FreeRTOS — 1ms ticks for responsive scheduling
 CONFIG_FREERTOS_HZ=1000

-# Task watchdog — 10s for heavy decode workload
-CONFIG_ESP_TASK_WDT_TIMEOUT_S=10
-# Disable interrupt WDT on Core 1 — long DMA waits during video decode are normal
+# Task watchdog — 15s for heavy decode workload
+CONFIG_ESP_TASK_WDT_TIMEOUT_S=15
 CONFIG_ESP_INT_WDT_CHECK_CPU1=n
-
-# mDNS — for Android Auto service discovery (_androidauto._tcp)
-CONFIG_MDNS_MAX_SERVICES=4
@@ -522,19 +522,48 @@ pub fn i420_to_rgb565_strip(
        let v_row = unsafe { v_plane.add(uv_row_off) };
        let out_off = dy_local * dst_w_us;

-        for dx in 0..dst_w_us {
+        // Process 2 pixels per iteration — adjacent pixels often share
+        // the same UV values, and this halves loop overhead.
+        let mut dx = 0usize;
+        let dst_w_pairs = dst_w_us & !1; // round down to even
+        while dx < dst_w_pairs {
+            unsafe {
+                let src_x0 = *x_map.get_unchecked(dx) as usize;
+                let src_x1 = *x_map.get_unchecked(dx + 1) as usize;
+                let uv_x0 = src_x0 >> 1;
+                let uv_x1 = src_x1 >> 1;
+
+                // Pixel 0
+                let y_val0 = *lut.y_r.get_unchecked(*y_row.add(src_x0) as usize);
+                let u_idx0 = *u_row.add(uv_x0) as usize;
+                let v_idx0 = *v_row.add(uv_x0) as usize;
+                let r0 = clamp8(y_val0 + *lut.v_r.get_unchecked(v_idx0));
+                let g0 = clamp8(y_val0 - *lut.v_g.get_unchecked(v_idx0) - *lut.u_g.get_unchecked(u_idx0));
+                let b0 = clamp8(y_val0 + *lut.u_b.get_unchecked(u_idx0));
+                *out.get_unchecked_mut(out_off + dx) = ((r0 >> 3) << 11) | ((g0 >> 2) << 5) | (b0 >> 3);
+
+                // Pixel 1
+                let y_val1 = *lut.y_r.get_unchecked(*y_row.add(src_x1) as usize);
+                let u_idx1 = *u_row.add(uv_x1) as usize;
+                let v_idx1 = *v_row.add(uv_x1) as usize;
+                let r1 = clamp8(y_val1 + *lut.v_r.get_unchecked(v_idx1));
+                let g1 = clamp8(y_val1 - *lut.v_g.get_unchecked(v_idx1) - *lut.u_g.get_unchecked(u_idx1));
+                let b1 = clamp8(y_val1 + *lut.u_b.get_unchecked(u_idx1));
+                *out.get_unchecked_mut(out_off + dx + 1) = ((r1 >> 3) << 11) | ((g1 >> 2) << 5) | (b1 >> 3);
+            }
+            dx += 2;
+        }
+        // Handle odd last pixel if display width is odd
+        if dx < dst_w_us {
            unsafe {
                let src_x = *x_map.get_unchecked(dx) as usize;
                let uv_x = src_x >> 1;
-
                let y_val = *lut.y_r.get_unchecked(*y_row.add(src_x) as usize);
                let u_idx = *u_row.add(uv_x) as usize;
                let v_idx = *v_row.add(uv_x) as usize;
-
                let r = clamp8(y_val + *lut.v_r.get_unchecked(v_idx));
                let g = clamp8(y_val - *lut.v_g.get_unchecked(v_idx) - *lut.u_g.get_unchecked(u_idx));
                let b = clamp8(y_val + *lut.u_b.get_unchecked(u_idx));
-
                *out.get_unchecked_mut(out_off + dx) = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);
            }
        }
@@ -606,6 +635,93 @@ pub fn i420_to_rgb565_bilinear(
    }
 }

+/// Convert a horizontal strip of I420 to RGB565 using center-crop (no scaling).
+///
+/// Extracts the center `dst_w × dst_h` region from the `src_w × src_h` frame.
+/// Each I420 pixel is converted 1:1 to RGB565 — no bilinear/nearest-neighbor
+/// interpolation, no scaling math. This is ~40% faster than the downscale path
+/// because the inner loop is a simple sequential read with no x_map lookup.
+///
+/// `i420`: planar frame laid out as a `src_w * src_h` byte Y plane, followed by
+/// a U plane and then a V plane of `src_w * src_h / 4` bytes each (chroma row
+/// stride is `src_w / 2`).
+/// `dst_y_start` / `strip_h`: which output rows to produce (for strip-based DMA).
+/// Rows at or beyond `dst_h` are not produced.
+/// `out`: DMA staging buffer, must hold `dst_w * strip_h` u16 entries.
+///
+/// # Preconditions
+///
+/// None of these are checked; all plane reads and output writes are unchecked.
+/// - `dst_w <= src_w` and `dst_h <= src_h`: the crop offsets are computed with
+///   plain `u32` subtraction, which overflows otherwise.
+/// - `i420` must contain all three planes described above.
+/// - `out` must be large enough for every row written (see above).
+#[cfg(feature = "crop-video")]
+pub fn i420_to_rgb565_strip_crop(
+    i420: &[u8],
+    src_w: u32,
+    src_h: u32,
+    dst_w: u32,
+    dst_h: u32,
+    dst_y_start: u32,
+    strip_h: u32,
+    out: &mut [u16],
+) {
+    // presumably precomputed YUV→RGB lookup tables — helper is defined outside this view
+    let lut = get_yuv_lut();
+    let src_pixels = (src_w * src_h) as usize;
+    // Plane base pointers: Y at the start, U right after Y, V right after U.
+    let y_plane = i420.as_ptr();
+    let u_plane = unsafe { y_plane.add(src_pixels) };
+    let v_plane = unsafe { u_plane.add(src_pixels / 4) };
+    let uv_stride = (src_w / 2) as usize;
+    let dst_w_us = dst_w as usize;
+
+    // Crop offsets: center the dst region within the src frame
+    let crop_x = ((src_w - dst_w) / 2) as usize;
+    let crop_y = ((src_h - dst_h) / 2) as usize;
+
+    for dy_local in 0..strip_h as usize {
+        // `dy` is the row within the full cropped image; `dy_local` is the row
+        // within this strip (and therefore within `out`).
+        let dy = dst_y_start as usize + dy_local;
+        if dy >= dst_h as usize {
+            break;
+        }
+
+        let src_y = crop_y + dy;
+        // Row pointers already include the horizontal crop offset, so the
+        // inner loops index them directly with `dx`.
+        let y_row = unsafe { y_plane.add(src_y * src_w as usize + crop_x) };
+        // NOTE(review): the chroma row starts at `crop_x / 2` and pixels use
+        // `dx >> 1`, which matches the luma position only when `crop_x` is even
+        // (it is 160 for 800→480); an odd `crop_x` would shift chroma by one
+        // sample on odd `dx` — confirm callers never produce one.
+        let uv_row_off = (src_y / 2) * uv_stride + crop_x / 2;
+        let u_row = unsafe { u_plane.add(uv_row_off) };
+        let v_row = unsafe { v_plane.add(uv_row_off) };
+        let out_off = dy_local * dst_w_us;
+
+        // 1:1 pixel copy — no scaling, just YUV→RGB565 conversion.
+        // Process 2 pixels at a time (share UV for adjacent pixel pairs).
+        // `dx` is always even in this loop, so `uv_x0 == uv_x1`: both pixels
+        // use the same chroma sample, although U and V are loaded once per pixel.
+        let mut dx = 0usize;
+        let dst_w_pairs = dst_w_us & !1;
+        while dx < dst_w_pairs {
+            // SAFETY: relies on the preconditions in the function docs — the
+            // plane reads and the `out` writes are not bounds-checked here.
+            unsafe {
+                let uv_x0 = dx >> 1;
+                let uv_x1 = (dx + 1) >> 1;
+
+                let y_val0 = *lut.y_r.get_unchecked(*y_row.add(dx) as usize);
+                let u_idx0 = *u_row.add(uv_x0) as usize;
+                let v_idx0 = *v_row.add(uv_x0) as usize;
+                let r0 = clamp8(y_val0 + *lut.v_r.get_unchecked(v_idx0));
+                let g0 = clamp8(y_val0 - *lut.v_g.get_unchecked(v_idx0) - *lut.u_g.get_unchecked(u_idx0));
+                let b0 = clamp8(y_val0 + *lut.u_b.get_unchecked(u_idx0));
+                // Pack as RGB565: top 5 bits of R, top 6 of G, top 5 of B.
+                *out.get_unchecked_mut(out_off + dx) = ((r0 >> 3) << 11) | ((g0 >> 2) << 5) | (b0 >> 3);
+
+                let y_val1 = *lut.y_r.get_unchecked(*y_row.add(dx + 1) as usize);
+                let u_idx1 = *u_row.add(uv_x1) as usize;
+                let v_idx1 = *v_row.add(uv_x1) as usize;
+                let r1 = clamp8(y_val1 + *lut.v_r.get_unchecked(v_idx1));
+                let g1 = clamp8(y_val1 - *lut.v_g.get_unchecked(v_idx1) - *lut.u_g.get_unchecked(u_idx1));
+                let b1 = clamp8(y_val1 + *lut.u_b.get_unchecked(u_idx1));
+                *out.get_unchecked_mut(out_off + dx + 1) = ((r1 >> 3) << 11) | ((g1 >> 2) << 5) | (b1 >> 3);
+            }
+            dx += 2;
+        }
+        // Trailing pixel when `dst_w` is odd (the pair loop stops one short).
+        if dx < dst_w_us {
+            // SAFETY: same preconditions as the pair loop above.
+            unsafe {
+                let uv_x = dx >> 1;
+                let y_val = *lut.y_r.get_unchecked(*y_row.add(dx) as usize);
+                let u_idx = *u_row.add(uv_x) as usize;
+                let v_idx = *v_row.add(uv_x) as usize;
+                let r = clamp8(y_val + *lut.v_r.get_unchecked(v_idx));
+                let g = clamp8(y_val - *lut.v_g.get_unchecked(v_idx) - *lut.u_g.get_unchecked(u_idx));
+                let b = clamp8(y_val + *lut.u_b.get_unchecked(u_idx));
+                *out.get_unchecked_mut(out_off + dx) = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);
+            }
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -55,8 +55,10 @@ fn main() -> Result<()> {
    log::info!("=== ESP32 Android Auto Navigation Head Unit ===");
    #[cfg(feature = "nav-only")]
    log::info!("Mode: NAV-ONLY (turn-by-turn text, no video decode)");
-    #[cfg(not(feature = "nav-only"))]
-    log::info!("Mode: FULL VIDEO (H.264 decode + display)");
+    #[cfg(all(not(feature = "nav-only"), feature = "crop-video"))]
+    log::info!("Mode: CROP VIDEO (center 480×320 from 800×480, no scaling)");
+    #[cfg(all(not(feature = "nav-only"), not(feature = "crop-video")))]
+    log::info!("Mode: FULL VIDEO (H.264 decode + downscale + display)");

    // Check PSRAM availability
    let free_psram = unsafe {
@@ -133,7 +135,7 @@ fn main() -> Result<()> {
    #[cfg(not(feature = "nav-only"))]
    let decode_tx = {
        // Channel for raw H.264 NAL data → decode+display thread.
-        let (decode_tx, decode_rx) = mpsc::sync_channel::<Vec<u8>>(2);
+        let (decode_tx, decode_rx) = mpsc::sync_channel::<Vec<u8>>(4);

        // Spawn navigation UI thread (log-only in video mode — LCD is owned by video)
        let _ui_thread = thread::Builder::new()
@@ -862,11 +864,78 @@ fn png_unfilter(data: &mut [u8], width: usize, height: usize, channels: usize) {
 ///
 /// Only compiled in video mode (not nav-only).
 #[cfg(not(feature = "nav-only"))]
+/// Work item sent to the converter helper thread.
+/// Contains raw pointers to shared I420 input and DMA output buffers.
+///
+/// The worker rebuilds a `&[u8]` from `i420_ptr`/`i420_len` and a `&mut [u16]`
+/// from `out_ptr`/`out_len`, then passes the remaining fields straight through
+/// to the strip converter.
+// NOTE(review): this struct was inserted directly below the
+// `#[cfg(not(feature = "nav-only"))]` attribute and doc comment that used to
+// precede `decode_display_loop`, so both now apply to `ConvertWork` instead.
+// Confirm that `converter_worker` and `decode_display_loop` are still gated
+// as intended for nav-only builds.
+struct ConvertWork {
+    /// Start of the I420 input frame (read-only for the worker).
+    i420_ptr: *const u8,
+    /// Length of the I420 input in bytes.
+    i420_len: usize,
+    /// Source frame width in pixels.
+    src_w: u32,
+    /// Source frame height in pixels.
+    src_h: u32,
+    /// Output image width in pixels.
+    dst_w: u32,
+    /// Output image height in pixels.
+    dst_h: u32,
+    /// First output row this job should produce.
+    dst_y_start: u32,
+    /// Number of output rows this job should produce.
+    strip_h: u32,
+    /// Start of the output region the worker writes RGB565 pixels into.
+    out_ptr: *mut u16,
+    /// Length of the output region in `u16` entries.
+    out_len: usize,
+}
+
+// SAFETY: Pointers are valid for the duration of the work item.
+// The sending thread blocks on `done_rx` before it flushes or reuses the
+// output region handed to the worker; while the job is in flight it only
+// reads the shared I420 input and writes a disjoint part of the DMA buffer.
+unsafe impl Send for ConvertWork {}
+
+/// Converter worker thread body — converts I420 strips to RGB565 on request.
+/// Receives half-strip conversion jobs and signals completion.
+///
+/// For each `ConvertWork` received on `rx`, rebuilds the input and output
+/// slices from the raw pointers it carries, runs the strip converter
+/// (`i420_to_rgb565_strip_crop` with the `crop-video` feature,
+/// `i420_to_rgb565_strip` without it), then sends `()` on `done_tx`.
+/// Returns when `rx.recv()` fails, i.e. once the work channel is disconnected.
+// NOTE(review): this was described as sitting "on core opposite to the decode
+// thread", but nothing in this function pins it to a core — confirm at the
+// spawn site / scheduler configuration.
+fn converter_worker(
+    rx: mpsc::Receiver<ConvertWork>,
+    done_tx: mpsc::SyncSender<()>,
+) {
+    loop {
+        // Block until the next job arrives; a recv error means every sender
+        // is gone, so the thread shuts down.
+        let work = match rx.recv() {
+            Ok(w) => w,
+            Err(_) => return,
+        };
+        // SAFETY: relies on the sender's guarantee (see `unsafe impl Send for
+        // ConvertWork`) that both pointer/length pairs describe live buffers
+        // and that nothing else writes the output region until the completion
+        // signal below has been received.
+        unsafe {
+            let i420 = core::slice::from_raw_parts(work.i420_ptr, work.i420_len);
+            let out = core::slice::from_raw_parts_mut(work.out_ptr, work.out_len);
+            #[cfg(feature = "crop-video")]
+            decoder::i420_to_rgb565_strip_crop(
+                i420,
+                work.src_w, work.src_h,
+                work.dst_w, work.dst_h,
+                work.dst_y_start, work.strip_h,
+                out,
+            );
+            #[cfg(not(feature = "crop-video"))]
+            decoder::i420_to_rgb565_strip(
+                i420,
+                work.src_w, work.src_h,
+                work.dst_w, work.dst_h,
+                work.dst_y_start, work.strip_h,
+                out,
+            );
+        }
+        // Signal completion. A failed send means the receiver was dropped;
+        // the error is deliberately ignored.
+        let _ = done_tx.send(());
+    }
+}
+
 fn decode_display_loop(decode_rx: mpsc::Receiver<Vec<u8>>, lcd: display::Display) {
-    log::info!("Decode+display thread started (strip-by-strip direct-to-DMA)");
+    log::info!("Decode+display thread started (display every frame)");
+
+    // Non-crop mode: spawn converter helper for dual-core strip splitting
+    #[cfg(not(feature = "crop-video"))]
+    let (work_tx, work_rx) = mpsc::sync_channel::<ConvertWork>(1);
+    #[cfg(not(feature = "crop-video"))]
+    let (done_tx, done_rx) = mpsc::sync_channel::<()>(1);
+    #[cfg(not(feature = "crop-video"))]
+    let _converter = thread::Builder::new()
+        .name("converter".into())
+        .stack_size(4096)
+        .spawn(move || converter_worker(work_rx, done_tx))
+        .expect("converter thread");

    let mut dec: Option<decoder::H264Decoder> = None;
    let mut frame_count: u64 = 0;
+    let mut display_count: u64 = 0;
    let mut skip_count: u64 = 0;
    let strip_h: u32 = display::STRIP_LINES as u32;

@@ -880,16 +949,16 @@ fn decode_display_loop(decode_rx: mpsc::Receiver<Vec<u8>>, lcd: display::Display
            }
        };

-        // Drain all queued chunks: decode each one to maintain H.264 state,
-        // but skip the expensive YUV→RGB565 conversion (discard mode).
-        // Only the latest chunk will get full conversion + display.
+        // Drain all queued chunks: discard older frames and keep only
+        // the latest.  We do NOT decode discarded frames — each
+        // esp_h264_dec_process call takes ~300ms for 800×480, so
+        // decoding throwaway frames was the #1 bottleneck (0.9 fps!).
+        // P-frames may glitch briefly until the next keyframe, but
+        // that's far better than wasting 1+ second per iteration.
        loop {
            match decode_rx.try_recv() {
                Ok(next) => {
-                    if let Some(d) = &mut dec {
-                        let _ = d.decode_into(&data, &mut []); // decode-only
-                        skip_count += 1;
-                    }
+                    skip_count += 1;
                    data = next;
                }
                Err(_) => break,
@@ -912,11 +981,14 @@ fn decode_display_loop(decode_rx: mpsc::Receiver<Vec<u8>>, lcd: display::Display
        let d = dec.as_mut().unwrap();

        // Decode the latest NAL → get raw I420 pointer
+        let t0 = unsafe { esp_idf_sys::esp_timer_get_time() };
        match d.decode_raw(&data) {
            Ok(Some((i420_ptr, i420_len))) => {
+                let t1 = unsafe { esp_idf_sys::esp_timer_get_time() };
                frame_count += 1;

-                if frame_count % 60 == 1 {
+                display_count += 1;
+                if display_count % 30 == 1 {
                    let free_psram = unsafe {
                        esp_idf_sys::heap_caps_get_free_size(esp_idf_sys::MALLOC_CAP_SPIRAM)
                    };
@@ -924,36 +996,85 @@ fn decode_display_loop(decode_rx: mpsc::Receiver<Vec<u8>>, lcd: display::Display
                        esp_idf_sys::heap_caps_get_free_size(esp_idf_sys::MALLOC_CAP_INTERNAL)
                    };
                    log::info!(
-                        "Frame #{} (skipped {}, PSRAM {}KB, DRAM {}KB free)",
-                        frame_count, skip_count, free_psram / 1024, free_dram / 1024,
+                        "Display #{} (decoded {}, skipped {}, PSRAM {}KB, DRAM {}KB free)",
+                        display_count, frame_count, skip_count, free_psram / 1024, free_dram / 1024,
                    );
                }

                // SAFETY: I420 data is component-owned, valid until next decode call.
-                // We consume it fully here before the next loop iteration.
                let i420 = unsafe { core::slice::from_raw_parts(i420_ptr, i420_len) };

+                let src_w = d.source_width();
+                let src_h = d.source_height();
                let (dst_w, dst_h) = d.output_dimensions();

-                // Strip-by-strip: convert I420→RGB565 directly into DMA staging
-                // SRAM buffers and push to LCD.  Alternating buffers overlap
-                // DMA transfer with CPU conversion (double-buffered pipeline).
+                // Strip-by-strip rendering.
+                // Crop mode: single-threaded 1:1 copy (trivially fast, ~2ms/strip).
+                // Scale mode: dual-core split (worker + main process half each).
+                // Double-buffered DMA pipeline overlaps transfer with computation.
                let mut buf_idx: usize = 0;
                for y in (0..dst_h).step_by(strip_h as usize) {
                    let h = strip_h.min(dst_h - y);
                    let dma_buf = lcd.dma_stage_mut(buf_idx);

-                    decoder::i420_to_rgb565_strip(
-                        i420,
-                        d.source_width(), d.source_height(),
-                        dst_w, dst_h,
-                        y, h,
-                        dma_buf,
-                    );
+                    #[cfg(feature = "crop-video")]
+                    {
+                        // Crop: simple 1:1 copy, no dual-core needed
+                        decoder::i420_to_rgb565_strip_crop(
+                            i420,
+                            src_w, src_h,
+                            dst_w, dst_h,
+                            y, h,
+                            &mut dma_buf[..(h * dst_w) as usize],
+                        );
+                    }
+
+                    #[cfg(not(feature = "crop-video"))]
+                    {
+                        // Scale: split strip in half for dual-core
+                        let half_h = h / 2;
+                        let lower_h = h - half_h;
+
+                        // Send top half to converter worker
+                        let _ = work_tx.send(ConvertWork {
+                            i420_ptr: i420.as_ptr(),
+                            i420_len: i420.len(),
+                            src_w, src_h,
+                            dst_w, dst_h,
+                            dst_y_start: y,
+                            strip_h: half_h,
+                            out_ptr: dma_buf.as_mut_ptr(),
+                            out_len: (half_h * dst_w) as usize,
+                        });
+
+                        // Convert bottom half on this thread
+                        let lower_offset = (half_h * dst_w) as usize;
+                        decoder::i420_to_rgb565_strip(
+                            i420,
+                            src_w, src_h,
+                            dst_w, dst_h,
+                            y + half_h, lower_h,
+                            &mut dma_buf[lower_offset..lower_offset + (lower_h * dst_w) as usize],
+                        );
+
+                        // Wait for worker to finish top half
+                        let _ = done_rx.recv();
+                    }

                    lcd.flush_strip(y as u16, h as u16, buf_idx);
                    buf_idx ^= 1;
                }
+
+                let t2 = unsafe { esp_idf_sys::esp_timer_get_time() };
+                if display_count % 30 == 1 {
+                    let decode_ms = (t1 - t0) / 1000;
+                    let render_ms = (t2 - t1) / 1000;
+                    log::info!("⏱ decode={}ms render={}ms total={}ms", decode_ms, render_ms, decode_ms + render_ms);
+                }
+
+                // Yield to IDLE0 after each frame so the task watchdog
+                // doesn't trigger (decode + convert can take 300ms+).
+                unsafe { esp_idf_sys::vTaskDelay(1); }
            }
            Ok(None) => {} // SPS/PPS/SEI — no image data (normal at stream start)
            Err(e) => {
@@ -963,7 +1084,7 @@ fn decode_display_loop(decode_rx: mpsc::Receiver<Vec<u8>>, lcd: display::Display
    }
 }

-/// Touch polling loop — reads FT6336U at ~30Hz and sends events.
+/// Touch polling loop — reads FT6336U at ~60Hz and sends events.
 fn touch_poll_loop(mut touch: touch::Touch<'static>, tx: mpsc::Sender<touch::TouchEvent>) {
    log::info!("Touch polling thread started");
    loop {
@@ -971,6 +1092,6 @@ fn touch_poll_loop(mut touch: touch::Touch<'static>, tx: mpsc::Sender<touch::Tou
            log::debug!("👆 Touch: ({}, {}) pressed={}", event.x, event.y, event.pressed);
            let _ = tx.send(event);
        }
-        std::thread::sleep(Duration::from_millis(33)); // ~30Hz
+        std::thread::sleep(Duration::from_millis(16)); // ~60Hz
    }
 }
@@ -8,7 +8,9 @@
 //! 4. Channel message dispatching (navigation, video stub, audio stub, etc.)

 use std::io::{Read, Write};
+use std::net::TcpStream;
 use std::sync::mpsc;
+use std::time::Duration;

 use anyhow::{Context, Result, bail};
 use protobuf::{Enum, Message};
@@ -60,8 +62,8 @@ impl Default for ChannelMap {
 /// `nav_tx` sends navigation events to the UI thread.
 /// `decode_tx` sends raw H.264 NAL data to the long-lived decode+display thread.
 /// `touch_rx` receives touch events from the touch polling thread.
-pub fn run_session<S: Read + Write>(
-    stream: &mut S,
+pub fn run_session(
+    stream: &mut TcpStream,
    config: &HeadUnitConfig,
    nav_tx: &mpsc::Sender<NavEvent>,
    decode_tx: &mpsc::SyncSender<Vec<u8>>,
@@ -90,25 +92,62 @@ pub fn run_session<S: Read + Write>(
    let mut touch_pressed: bool = false;
    let mut touch_event_count: u64 = 0;
    let mut loop_count: u32 = 0;
+    let mut last_focus_kick_us: u64 = 0;
+
+    // Set read timeout for peek polling — allows us to drain touch
+    // events every ~50ms even when the phone isn't sending data.
+    stream.set_read_timeout(Some(Duration::from_millis(50)))?;

    loop {
-        let frame = reader.read_frame(stream, &mut tls)?;
-        let channel_id = frame.header.channel_id;
-        let is_control_bit = frame.header.frame.get_control();
-        let is_encrypted = frame.header.frame.get_encryption();
-
        // Yield every 10 iterations so IDLE0 can run (prevents task WDT)
        loop_count += 1;
        if loop_count % 10 == 0 {
            std::thread::yield_now();
        }

-        // Drain any pending touch events every iteration (video arrives ~30fps,
-        // so touch latency is at most ~33ms — good enough for interaction).
+        // Drain any pending touch events BEFORE blocking on read.
+        // This ensures touch events are sent promptly even when
+        // the phone isn't sending data.
+        let mut sent_touch = false;
        while let Ok(te) = touch_rx.try_recv() {
            send_touch_event(stream, &mut tls, ch.input, te, &mut touch_pressed, &mut touch_event_count)?;
+            sent_touch = true;
        }

+        // If touch events were sent while phone may be idle, send
+        // VideoFocusIndication to prompt it to render and send fresh frames.
+        if sent_touch && video_session.is_some() {
+            let now_us = unsafe { esp_idf_sys::esp_timer_get_time() } as u64;
+            if now_us.wrapping_sub(last_focus_kick_us) > 500_000 {
+                let kick = channels::video_focus_frame_unrequested(ch.video, true);
+                if let Err(e) = frame::write_frame(stream, &kick, &mut tls) {
+                    log::warn!("Video focus kick failed: {:?}", e);
+                }
+                last_focus_kick_us = now_us;
+            }
+        }
+
+        // Peek to check if data is available (blocks up to 50ms).
+        // peek() doesn't consume data, so partial-read corruption is impossible.
+        let mut peek_buf = [0u8; 1];
+        match stream.peek(&mut peek_buf) {
+            Ok(_) => {} // Data available — proceed to read frame
+            Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock
+                       || e.kind() == std::io::ErrorKind::TimedOut => {
+                continue; // No data — loop back to drain touch events
+            }
+            Err(e) => return Err(e).context("peeking socket for data"),
+        }
+
+        // Data is on the socket — read the full frame with generous timeout
+        stream.set_read_timeout(Some(Duration::from_secs(5)))?;
+        let frame = reader.read_frame(stream, &mut tls)?;
+        stream.set_read_timeout(Some(Duration::from_millis(50)))?;
+
+        let channel_id = frame.header.channel_id;
+        let is_control_bit = frame.header.frame.get_control();
+        let is_encrypted = frame.header.frame.get_encryption();
+
        // Log incoming frames at debug level (very high volume)
        log::debug!(
            "⬅️  ch={} ctrl={} enc={} len={} data={:02x?}",
@@ -434,12 +473,23 @@ fn send_touch_event<S: Read + Write>(
    // FT6336U on WT32-SC01 Plus with MADCTL MV|MY (landscape):
    //   raw_x: 0..319 maps to display Y (top→bottom)
    //   raw_y: 0..479 maps to display X (right→left, inverted)
-    // Verified empirically:
-    //   bottom-right raw(273,11) → AA(780,409) ✓
-    //   bottom-left  raw(291,446) → AA(55,436) ✓
-    //   top-left     raw(16,453) → AA(43,24)  ✓
-    let aa_x = (479u32.saturating_sub(te.y as u32)) * 800 / 480;
-    let aa_y = (te.x as u32) * 480 / 320;
+    //
+    // In crop mode, the display shows the center 480×320 of the 800×480 frame,
+    // so we add the crop offset (160, 80) to map from display to AA coordinates.
+    // In downscale mode, the display shows the full 800×480 scaled to 480×320.
+    #[cfg(feature = "crop-video")]
+    let (aa_x, aa_y) = {
+        let disp_x = 479u32.saturating_sub(te.y as u32);
+        let disp_y = te.x as u32;
+        // Crop offset: (800-480)/2 = 160, (480-320)/2 = 80
+        (disp_x + 160, disp_y + 80)
+    };
+    #[cfg(not(feature = "crop-video"))]
+    let (aa_x, aa_y) = {
+        let aa_x = (479u32.saturating_sub(te.y as u32)) * 800 / 480;
+        let aa_y = (te.x as u32) * 480 / 320;
+        (aa_x, aa_y)
+    };

    let action = if te.pressed {
        if *touch_pressed {