chess_corners_core/
response.rs

1//! Dense ChESS response computation for 8-bit grayscale inputs.
2use crate::ring::RingOffsets;
3use crate::{ChessParams, ResponseMap};
4
5#[cfg(feature = "rayon")]
6use rayon::prelude::*;
7
8#[cfg(feature = "simd")]
9use core::simd::Simd;
10
11#[cfg(feature = "simd")]
12use std::simd::prelude::{SimdInt, SimdUint};
13
14#[cfg(feature = "simd")]
15const LANES: usize = 16;
16
17#[cfg(feature = "simd")]
18type U8s = Simd<u8, LANES>;
19
20#[cfg(feature = "simd")]
21type I16s = Simd<i16, LANES>;
22
23#[cfg(feature = "simd")]
24type I32s = Simd<i32, LANES>;
25
26#[cfg(feature = "tracing")]
27use tracing::instrument;
28
29/// Rectangular region of interest, in image coordinates `[x0, x1) × [y0, y1)`.
30///
31/// Invariant: `x0 < x1` and `y0 < y1`.
32#[derive(Clone, Copy, Debug)]
33pub struct Roi {
34    x0: usize,
35    y0: usize,
36    x1: usize,
37    y1: usize,
38}
39
40impl Roi {
41    /// Create a new ROI. Returns `None` if `x0 >= x1` or `y0 >= y1`.
42    pub fn new(x0: usize, y0: usize, x1: usize, y1: usize) -> Option<Self> {
43        if x0 < x1 && y0 < y1 {
44            Some(Self { x0, y0, x1, y1 })
45        } else {
46            None
47        }
48    }
49
50    #[inline]
51    pub fn x0(&self) -> usize {
52        self.x0
53    }
54    #[inline]
55    pub fn y0(&self) -> usize {
56        self.y0
57    }
58    #[inline]
59    pub fn x1(&self) -> usize {
60        self.x1
61    }
62    #[inline]
63    pub fn y1(&self) -> usize {
64        self.y1
65    }
66}
67
68#[inline]
69fn ring_from_params(params: &ChessParams) -> (RingOffsets, &'static [(i32, i32); 16]) {
70    let ring = params.ring();
71    (ring, ring.offsets())
72}
73
74/// Compute the dense ChESS response for an 8-bit grayscale image.
75///
76/// The response at each valid pixel center is computed from a 16‑sample ring
77/// around the pixel and a 5‑pixel cross at the center. For a given center
78/// `c`, let `s[0..16)` be the ring samples in the canonical order:
79///
80/// - `SR` (sum of “square” responses) compares four opposite quadrants on
81///   the ring:
82///
83///   ```text
84///   SR = sum_{k=0..3} | (s[k] + s[k+8]) - (s[k+4] + s[k+12]) |
85///   ```
86///
87/// - `DR` (sum of “difference” responses) enforces edge‑like structure:
88///
89///   ```text
90///   DR = sum_{k=0..7} | s[k] - s[k+8] |
91///   ```
92///
93/// - `μₙ` is the mean of all 16 ring samples.
94/// - `μₗ` is the local mean of the 5‑pixel cross at the center
95///   (`c`, `north`, `south`, `east`, `west`).
96///
97/// The final ChESS response is:
98///
99/// ```text
100/// R = SR - DR - 16 * |μₙ - μₗ|
101/// ```
102///
103/// where high positive values correspond to chessboard‑like corners.
104///
105/// # Implementation strategy
106///
107/// Internally the image is processed row‑by‑row, but only pixels whose
108/// full ring lies inside the image bounds are evaluated; border pixels
109/// are left at zero in the returned [`ResponseMap`].
110///
111/// - Without any features, `chess_response_u8` uses a straightforward
112///   nested `for y { for x { ... } }` scalar loop and relies on the
113///   compiler’s auto‑vectorization in release builds.
114/// - With the `rayon` feature, the work is split into independent row
115///   slices and processed in parallel using `rayon::par_chunks_mut`.
116/// - With the `simd` feature, the inner loop over `x` is rewritten to
117///   operate on `LANES` pixels at a time using portable SIMD vectors
118///   (currently 16 lanes of `u8`). The ring samples are gathered into
119///   SIMD registers, `SR`/`DR`/`μₙ` are accumulated in integer vectors,
120///   and the final response is written back per lane.
121/// - With both `rayon` and `simd` enabled, each row is processed in
122///   parallel *and* each row uses the SIMD‑accelerated inner loop.
123///
124/// All feature combinations produce the same output values (within a
125/// small tolerance for floating‑point rounding), and differ only in
126/// performance characteristics.
127#[cfg_attr(
128    feature = "tracing",
129    instrument(level = "info", skip(img, params), fields(w, h))
130)]
131pub fn chess_response_u8(img: &[u8], w: usize, h: usize, params: &ChessParams) -> ResponseMap {
132    // rayon path compiled only when feature is enabled
133    #[cfg(feature = "rayon")]
134    {
135        compute_response_parallel(img, w, h, params)
136    }
137    #[cfg(not(feature = "rayon"))]
138    {
139        compute_response_sequential(img, w, h, params)
140    }
141}
142
143/// Always uses the scalar implementation (no rayon, no SIMD),
144/// useful for reference/golden testing.
145pub fn chess_response_u8_scalar(
146    img: &[u8],
147    w: usize,
148    h: usize,
149    params: &ChessParams,
150) -> ResponseMap {
151    compute_response_sequential_scalar(img, w, h, params)
152}
153
154/// Compute the ChESS response only inside a rectangular ROI of the image.
155///
156/// The ROI is given in image coordinates [x0, x1) × [y0, y1) via [`Roi`]. The
157/// returned [`ResponseMap`] has width (x1 - x0) and height (y1 - y0), with
158/// coordinates relative to (x0, y0).
159///
160/// Pixels where the ChESS ring would go out of bounds (w.r.t. the *full*
161/// image) are left at 0.0, and will be ignored by the detector because they
162/// lie inside the border margin. Internally this reuses the same scalar,
163/// SIMD, and optional `rayon` row kernels as [`chess_response_u8`], so ROI
164/// refinement benefits from the same feature combinations as the full-frame
165/// response path.
166#[cfg_attr(
167    feature = "tracing",
168    instrument(
169        level = "debug",
170        skip(img, params),
171        fields(img_w, img_h, roi_w = roi.x1 - roi.x0, roi_h = roi.y1 - roi.y0)
172    )
173)]
174pub fn chess_response_u8_patch(
175    img: &[u8],
176    img_w: usize,
177    img_h: usize,
178    params: &ChessParams,
179    roi: Roi,
180) -> ResponseMap {
181    let Roi { x0, y0, x1, y1 } = roi;
182
183    // Clamp ROI to the image bounds
184    let x0 = x0.min(img_w);
185    let y0 = y0.min(img_h);
186    let x1 = x1.min(img_w);
187    let y1 = y1.min(img_h);
188
189    if x1 <= x0 || y1 <= y0 {
190        return ResponseMap {
191            w: 0,
192            h: 0,
193            data: Vec::new(),
194        };
195    }
196
197    let patch_w = x1 - x0;
198    let patch_h = y1 - y0;
199    let mut data = vec![0.0f32; patch_w * patch_h];
200
201    let (ring_kind, ring) = ring_from_params(params);
202    let r = ring_kind.radius() as i32;
203
204    // Safe region for ring centers in *global* image coordinates
205    let gx0 = r as usize;
206    let gy0 = r as usize;
207    let gx1 = img_w - r as usize;
208    let gy1 = img_h - r as usize;
209
210    for py in 0..patch_h {
211        let gy = y0 + py;
212        if gy < gy0 || gy >= gy1 {
213            continue;
214        }
215
216        // Global x-range where the ring is valid on this row.
217        let row_gx0 = x0.max(gx0);
218        let row_gx1 = x1.min(gx1);
219        if row_gx0 >= row_gx1 {
220            continue;
221        }
222
223        let row = &mut data[py * patch_w..(py + 1) * patch_w];
224        let rel_start = row_gx0 - x0;
225        let rel_end = row_gx1 - x0;
226        let dst_row = &mut row[rel_start..rel_end];
227
228        #[cfg(feature = "simd")]
229        {
230            compute_row_range_simd(img, img_w, gy as i32, ring, dst_row, row_gx0, row_gx1);
231        }
232
233        #[cfg(not(feature = "simd"))]
234        {
235            compute_row_range_scalar(img, img_w, gy as i32, ring, dst_row, row_gx0, row_gx1);
236        }
237    }
238
239    ResponseMap {
240        w: patch_w,
241        h: patch_h,
242        data,
243    }
244}
245
246#[cfg(not(feature = "rayon"))]
247fn compute_response_sequential(
248    img: &[u8],
249    w: usize,
250    h: usize,
251    params: &ChessParams,
252) -> ResponseMap {
253    let (ring_kind, ring) = ring_from_params(params);
254    let r = ring_kind.radius() as i32;
255
256    let mut data = vec![0.0f32; w * h];
257
258    // only evaluate where full ring fits
259    let x0 = r as usize;
260    let y0 = r as usize;
261    let x1 = w - r as usize;
262    let y1 = h - r as usize;
263
264    for y in y0..y1 {
265        let row = &mut data[y * w..(y + 1) * w];
266        let dst_row = &mut row[x0..x1];
267
268        #[cfg(feature = "simd")]
269        {
270            compute_row_range_simd(img, w, y as i32, ring, dst_row, x0, x1);
271        }
272
273        #[cfg(not(feature = "simd"))]
274        {
275            compute_row_range_scalar(img, w, y as i32, ring, dst_row, x0, x1);
276        }
277    }
278
279    ResponseMap { w, h, data }
280}
281
282fn compute_response_sequential_scalar(
283    img: &[u8],
284    w: usize,
285    h: usize,
286    params: &ChessParams,
287) -> ResponseMap {
288    let (ring_kind, ring) = ring_from_params(params);
289    let r = ring_kind.radius() as i32;
290
291    let mut data = vec![0.0f32; w * h];
292
293    // only evaluate where full ring fits
294    let x0 = r as usize;
295    let y0 = r as usize;
296    let x1 = w - r as usize;
297    let y1 = h - r as usize;
298
299    for y in y0..y1 {
300        let row = &mut data[y * w..(y + 1) * w];
301        let dst_row = &mut row[x0..x1];
302        compute_row_range_scalar(img, w, y as i32, ring, dst_row, x0, x1);
303    }
304
305    ResponseMap { w, h, data }
306}
307
308#[cfg(feature = "rayon")]
309fn compute_response_parallel(img: &[u8], w: usize, h: usize, params: &ChessParams) -> ResponseMap {
310    let (ring_kind, ring) = ring_from_params(params);
311    let r = ring_kind.radius() as i32;
312    let mut data = vec![0.0f32; w * h];
313
314    // ring margin
315    let x0 = r as usize;
316    let y0 = r as usize;
317    let x1 = w - r as usize;
318    let y1 = h - r as usize;
319
320    // Parallelize over rows. We keep the exact same logic and write
321    // each row's slice independently.
322    data.par_chunks_mut(w).enumerate().for_each(|(y, row)| {
323        let y_i = y as i32;
324        if y_i < y0 as i32 || y_i >= y1 as i32 {
325            return;
326        }
327
328        let dst_row = &mut row[x0..x1];
329
330        #[cfg(feature = "simd")]
331        {
332            compute_row_range_simd(img, w, y_i, ring, dst_row, x0, x1);
333        }
334
335        #[cfg(not(feature = "simd"))]
336        {
337            compute_row_range_scalar(img, w, y_i, ring, dst_row, x0, x1);
338        }
339    });
340
341    ResponseMap { w, h, data }
342}
343
344// Fallback stub when rayon feature is off so the name still exists
345#[cfg(not(feature = "rayon"))]
346#[allow(dead_code)]
347fn compute_response_parallel(img: &[u8], w: usize, h: usize, params: &ChessParams) -> ResponseMap {
348    compute_response_sequential(img, w, h, params)
349}
350
351/// Low-level ChESS response at a single pixel center.
352///
353/// This is the scalar reference implementation used by both the sequential
354/// and SIMD paths:
355///
356/// - gathers 16 ring samples around `(x, y)` using the offsets defined in
357///   [`crate::ring`],
358/// - computes `SR`, `DR`, the ring mean `μₙ`, and the 5‑pixel local mean
359///   `μₗ`, and
360/// - returns `R = SR - DR - 16 * |μₙ - μₗ|` as a `f32`.
361///
362/// Callers are responsible for ensuring that `(x, y)` is far enough from the
363/// image border so that all ring and 5‑pixel cross accesses are in‑bounds.
364#[inline(always)]
365fn chess_response_at_u8(img: &[u8], w: usize, x: i32, y: i32, ring: &[(i32, i32); 16]) -> f32 {
366    // gather ring samples into i32
367    let mut s = [0i32; 16];
368    for k in 0..16 {
369        let (dx, dy) = ring[k];
370        let xx = (x + dx) as usize;
371        let yy = (y + dy) as usize;
372        s[k] = img[yy * w + xx] as i32;
373    }
374
375    // SR
376    let mut sr = 0i32;
377    for k in 0..4 {
378        let a = s[k] + s[k + 8];
379        let b = s[k + 4] + s[k + 12];
380        sr += (a - b).abs();
381    }
382
383    // DR
384    let mut dr = 0i32;
385    for k in 0..8 {
386        dr += (s[k] - s[k + 8]).abs();
387    }
388
389    // neighbor mean
390    let sum_ring: i32 = s.iter().sum();
391    let mu_n = sum_ring as f32 / 16.0;
392
393    // local mean (5 px cross)
394    let c = img[(y as usize) * w + (x as usize)] as f32;
395    let n = img[((y - 1) as usize) * w + (x as usize)] as f32;
396    let s0 = img[((y + 1) as usize) * w + (x as usize)] as f32;
397    let e = img[(y as usize) * w + ((x + 1) as usize)] as f32;
398    let w0 = img[(y as usize) * w + ((x - 1) as usize)] as f32;
399    let mu_l = (c + n + s0 + e + w0) / 5.0;
400
401    let mr = (mu_n - mu_l).abs();
402
403    (sr as f32) - (dr as f32) - 16.0 * mr
404}
405
406fn compute_row_range_scalar(
407    img: &[u8],
408    w: usize,
409    y: i32,
410    ring: &[(i32, i32); 16],
411    dst_row: &mut [f32],
412    x_start: usize,
413    x_end: usize,
414) {
415    for (offset, x) in (x_start..x_end).enumerate() {
416        dst_row[offset] = chess_response_at_u8(img, w, x as i32, y, ring);
417    }
418}
419
420#[cfg(feature = "simd")]
421fn compute_row_range_simd(
422    img: &[u8],
423    w: usize,
424    y: i32,
425    ring: &[(i32, i32); 16],
426    dst_row: &mut [f32],
427    x_start: usize,
428    x_end: usize,
429) {
430    let y_usize = y as usize;
431
432    // Precompute row bases for each ring sample to avoid recomputing (y+dy)*w.
433    let mut ring_bases: [isize; 16] = [0; 16];
434    for k in 0..16 {
435        let (dx, dy) = ring[k];
436        let yy = (y + dy) as usize;
437        ring_bases[k] = (yy * w) as isize + dx as isize;
438    }
439
440    let mut x = x_start;
441
442    while x + LANES <= x_end {
443        // Gather ring samples for LANES pixels starting at x
444        let mut s: [I16s; 16] = [I16s::splat(0); 16];
445
446        for k in 0..16 {
447            let base = (ring_bases[k] + x as isize) as usize;
448
449            // SAFETY: x range + radius guarantees we stay in bounds
450            let v_u8 = U8s::from_slice(&img[base..base + LANES]);
451            s[k] = v_u8.cast::<i16>();
452        }
453
454        // Sum of ring values (for neighbor mean)
455        let mut sum_ring_v = I32s::splat(0);
456        for &v in &s {
457            sum_ring_v += v.cast::<i32>();
458        }
459
460        // SR
461        let mut sr_v = I32s::splat(0);
462        for k in 0..4 {
463            let a = s[k].cast::<i32>() + s[k + 8].cast::<i32>();
464            let b = s[k + 4].cast::<i32>() + s[k + 12].cast::<i32>();
465            sr_v += (a - b).abs();
466        }
467
468        // DR
469        let mut dr_v = I32s::splat(0);
470        for k in 0..8 {
471            let a = s[k].cast::<i32>();
472            let b = s[k + 8].cast::<i32>();
473            dr_v += (a - b).abs();
474        }
475
476        // Convert vectors to scalar arrays for the MR step
477        let sr_arr = sr_v.cast::<f32>().to_array();
478        let dr_arr = dr_v.cast::<f32>().to_array();
479        let sum_ring_arr = sum_ring_v.cast::<f32>().to_array();
480
481        // Per-lane local mean + final response
482        for lane in 0..LANES {
483            let xx = x + lane;
484            let px = xx - x_start;
485
486            // center + 4-neighborhood (scalar) at base resolution
487            let c = img[y_usize * w + xx] as f32;
488            let n = img[(y_usize - 1) * w + xx] as f32;
489            let s0 = img[(y_usize + 1) * w + xx] as f32;
490            let e = img[y_usize * w + (xx + 1)] as f32;
491            let w0 = img[y_usize * w + (xx - 1)] as f32;
492
493            let mu_n = sum_ring_arr[lane] / 16.0;
494            let mu_l = (c + n + s0 + e + w0) / 5.0;
495            let mr = (mu_n - mu_l).abs();
496
497            dst_row[px] = sr_arr[lane] - dr_arr[lane] - 16.0 * mr;
498        }
499
500        x += LANES;
501    }
502
503    // Tail: scalar for remaining pixels
504    while x < x_end {
505        let px = x - x_start;
506        let resp = chess_response_at_u8(img, w, x as i32, y, ring);
507        dst_row[px] = resp;
508        x += 1;
509    }
510}
chess_corners_core/response.rs

chess_corners_core/
response.rs