From 07711619d5be4b935fb6043488bfb601868e99a5 Mon Sep 17 00:00:00 2001 From: Bryan Kadzban Date: Sat, 10 Feb 2024 21:25:41 -0800 Subject: [PATCH 1/3] Use fixed-point u64 math to divide the PLL values This avoids the need for almost 1K of code to implement division for u64 values (which the Cortex-M7 can't do in hardware). It can multiply u64 values in hardware, so instead of dividing by (integer) x, we find 1/x in fixed-point, multiply, and shift the fixed-point offset back out. Finding 1/x can be done in a u32 since our shift is small enough, and that division *can* be done in hardware on this CPU. Fixes #211 --- CHANGELOG.md | 1 + src/rcc.rs | 98 +++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 71 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3923aeb..a81a682 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/). - Added a "low pin count" variant of the f730 chip to the crate features: packages <144 pins don't include a high speed USB PHY - Added SPI2_SCK pin for stm32f769i-discovery - Fix mass-erase triggering in `flash` on smaller chips +- Remove the need for software u64 division in the clock setup code, shrinking code (#211) ## [v0.7.0] - 2022-06-05 diff --git a/src/rcc.rs b/src/rcc.rs index b7c5cf2..5318f86 100644 --- a/src/rcc.rs +++ b/src/rcc.rs @@ -558,6 +558,33 @@ impl CFGR { self } + // We want to avoid dividing u64 values, because the Cortex-M7 CPU doesn't + // have hardware instructions for that, and the software divide that LLVM + // gives us is a relatively large amount of code. + // + // To do this, we operate in a fixed-point domain, and do a multiply by 1/x + // instead of dividing by x. We can calculate those 1/x values in a u32, if + // the fixed-point decimal place is chosen to be close enough to the LSB. + // + // But we also need to be able to represent the largest numerator, so we + // need enough bits to the left of the virtual decimal point. + // + // All of the chunks of code that do this are structured like: + // + // base_clk * n / m / p + // + // and they all have the same base_clk and n ranges (n up to 432, base_clk + // up to 50MHz). So base*plln can be as high as 216_000_000_000, and to + // represent that we need 38 bits. + // + // (We could use just 37 bits in one of these cases, if we take into account + // that high values of base_clk preclude using high values of n. But the + // other case is checking the output, so we can't assume anything about the + // inputs there.) + // + // So use 26 bits on the right of the decimal place. + const FIXED_POINT_SHIFT: u32 = 26; + /// Output clock calculation fn calculate_clocks(&self) -> (Clocks, InternalRCCConfig) { let mut config = InternalRCCConfig::default(); @@ -575,14 +602,20 @@ impl CFGR { let mut pll48clk_valid = false; if self.use_pll { - sysclk = base_clk as u64 * self.plln as u64 - / self.pllm as u64 - / match self.pllp { - PLLP::Div2 => 2, - PLLP::Div4 => 4, - PLLP::Div6 => 6, - PLLP::Div8 => 8, - }; + // These initial divisions have to operate on u32 values to avoid + // the software division. Fortunately our 26 bit choice for the + // decimal place, and the fact that these are 1/N, means we can + // fit them into 26 bits, so a u32 is fine. + let one_over_m = (1< 2u32, + PLLP::Div4 => 4u32, + PLLP::Div6 => 6u32, + PLLP::Div8 => 8u32, + }; + sysclk = + (((base_clk as u64 * self.plln as u64 * one_over_m as u64) >> Self::FIXED_POINT_SHIFT) + * one_over_p as u64) >> Self::FIXED_POINT_SHIFT; } // Check if pll48clk is valid @@ -590,23 +623,28 @@ impl CFGR { match pll48clk { PLL48CLK::Pllq => { pll48clk_valid = { - let pll48clk = base_clk as u64 * self.plln as u64 - / self.pllm as u64 - / self.pllq as u64; + let one_over_m = (1<> Self::FIXED_POINT_SHIFT) + * one_over_q as u64) >> Self::FIXED_POINT_SHIFT; (48_000_000 - 120_000..=48_000_000 + 120_000).contains(&pll48clk) } } PLL48CLK::Pllsai => { pll48clk_valid = { if self.use_pllsai { - let pll48clk = base_clk as u64 * self.pllsain as u64 - / self.pllm as u64 - / match self.pllsaip { - PLLSAIP::Div2 => 2, - PLLSAIP::Div4 => 4, - PLLSAIP::Div6 => 6, - PLLSAIP::Div8 => 8, - }; + // base_clk * pllsain has the same range as above + let one_over_m = (1< 2u32, + PLLSAIP::Div4 => 4u32, + PLLSAIP::Div6 => 6u32, + PLLSAIP::Div8 => 8u32, + }; + let pll48clk = (((base_clk as u64 * self.pllsain as u64 + * one_over_m as u64) >> Self::FIXED_POINT_SHIFT) + * one_over_p as u64) >> Self::FIXED_POINT_SHIFT; (48_000_000 - 120_000..=48_000_000 + 120_000).contains(&pll48clk) } else { false @@ -801,7 +839,10 @@ impl CFGR { n = 432; continue; } - let f_vco_clock = (f_pll_clock_input as u64 * n as u64 / m as u64) as u32; + // See the comments around Self::FIXED_POINT_SHIFT to see how this works. + let one_over_m = (1<> Self::FIXED_POINT_SHIFT) as u32; if f_vco_clock < 50_000_000 { m += 1; n = 432; @@ -885,15 +926,16 @@ impl CFGR { // We check if (pllm, plln, pllp) allow to obtain the requested Sysclk, // so that we don't have to calculate them + let one_over_m = (1< 2u32, + PLLP::Div4 => 4u32, + PLLP::Div6 => 6u32, + PLLP::Div8 => 8u32, + }; let p_ok = (sysclk as u64) - == (base_clk as u64 * self.plln as u64 - / self.pllm as u64 - / match self.pllp { - PLLP::Div2 => 2, - PLLP::Div4 => 4, - PLLP::Div6 => 6, - PLLP::Div8 => 8, - }); + == (((base_clk as u64 * self.plln as u64 * one_over_m as u64) >> Self::FIXED_POINT_SHIFT) + * one_over_p as u64) >> Self::FIXED_POINT_SHIFT; if p_ok && q.is_none() { return; } From 1d64824d38a3f80779fe3e302d46a6df4cbb1677 Mon Sep 17 00:00:00 2001 From: Bryan Kadzban Date: Sun, 11 Feb 2024 11:52:09 -0800 Subject: [PATCH 2/3] Round 1/x, and use 30-bit fixed point for them. This requires shifting the input frequencies down by 4 bits in order to fit both the max base_clk*n value, and the 30-bit fractional part, into a u64. (base_clk*n is up to 38 bits wide.) This is OK as long as all input frequencies are a multiple of 250kHz, which has the bottom 4 bits clear. --- src/rcc.rs | 70 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/src/rcc.rs b/src/rcc.rs index 5318f86..2a14092 100644 --- a/src/rcc.rs +++ b/src/rcc.rs @@ -583,7 +583,17 @@ impl CFGR { // inputs there.) // // So use 26 bits on the right of the decimal place. - const FIXED_POINT_SHIFT: u32 = 26; + // + // Also note, we need to round the 1/x values, not truncate them. So we + // shift left by one more bit, add one, and shift right by one. + const FIXED_POINT_LSHIFT: u32 = 31; + const FIXED_POINT_RSHIFT: u32 = 30; + + // We also drop 4 bits from the base_clk so that it and the fractional part + // (above) can fit into 64 bits. The max base_clk*n value needs 38 bits; + // shifting this out means it can fit into 34, with 30 (above) for the + // fractions. + const BASE_CLK_SHIFT: u32 = 4; /// Output clock calculation fn calculate_clocks(&self) -> (Clocks, InternalRCCConfig) { @@ -595,9 +605,9 @@ impl CFGR { None => HSI_FREQUENCY, } .raw(), - ); + ) >> Self::BASE_CLK_SHIFT; - let mut sysclk = base_clk; + let mut sysclk = base_clk << Self::BASE_CLK_SHIFT; let mut pll48clk_valid = false; @@ -606,16 +616,16 @@ impl CFGR { // the software division. Fortunately our 26 bit choice for the // decimal place, and the fact that these are 1/N, means we can // fit them into 26 bits, so a u32 is fine. - let one_over_m = (1<> 1; + let one_over_p = ((1< 2u32, PLLP::Div4 => 4u32, PLLP::Div6 => 6u32, PLLP::Div8 => 8u32, - }; + } + 1) >> 1; sysclk = - (((base_clk as u64 * self.plln as u64 * one_over_m as u64) >> Self::FIXED_POINT_SHIFT) - * one_over_p as u64) >> Self::FIXED_POINT_SHIFT; + (((base_clk as u64 * self.plln as u64 * one_over_m as u64) >> Self::FIXED_POINT_RSHIFT) + * one_over_p as u64) >> Self::FIXED_POINT_RSHIFT << Self::BASE_CLK_SHIFT; } // Check if pll48clk is valid @@ -623,11 +633,11 @@ impl CFGR { match pll48clk { PLL48CLK::Pllq => { pll48clk_valid = { - let one_over_m = (1<> 1; + let one_over_q = ((1<> 1; let pll48clk = (((base_clk as u64 * self.plln as u64 - * one_over_m as u64) >> Self::FIXED_POINT_SHIFT) - * one_over_q as u64) >> Self::FIXED_POINT_SHIFT; + * one_over_m as u64) >> Self::FIXED_POINT_RSHIFT) + * one_over_q as u64) >> Self::FIXED_POINT_RSHIFT << Self::BASE_CLK_SHIFT; (48_000_000 - 120_000..=48_000_000 + 120_000).contains(&pll48clk) } } @@ -635,16 +645,16 @@ impl CFGR { pll48clk_valid = { if self.use_pllsai { // base_clk * pllsain has the same range as above - let one_over_m = (1<> 1; + let one_over_p = ((1< 2u32, PLLSAIP::Div4 => 4u32, PLLSAIP::Div6 => 6u32, PLLSAIP::Div8 => 8u32, - }; + } + 1) >> 1; let pll48clk = (((base_clk as u64 * self.pllsain as u64 - * one_over_m as u64) >> Self::FIXED_POINT_SHIFT) - * one_over_p as u64) >> Self::FIXED_POINT_SHIFT; + * one_over_m as u64) >> Self::FIXED_POINT_RSHIFT) + * one_over_p as u64) >> Self::FIXED_POINT_RSHIFT << Self::BASE_CLK_SHIFT; (48_000_000 - 120_000..=48_000_000 + 120_000).contains(&pll48clk) } else { false @@ -839,10 +849,10 @@ impl CFGR { n = 432; continue; } - // See the comments around Self::FIXED_POINT_SHIFT to see how this works. - let one_over_m = (1<> Self::FIXED_POINT_SHIFT) as u32; + // See the comments around Self::FIXED_POINT_LSHIFT to see how this works. + let one_over_m = ((1<> 1; + let f_vco_clock = (((f_pll_clock_input as u64 >> Self::BASE_CLK_SHIFT) * n as u64 + * one_over_m as u64) >> Self::FIXED_POINT_RSHIFT << Self::BASE_CLK_SHIFT) as u32; if f_vco_clock < 50_000_000 { m += 1; n = 432; @@ -898,15 +908,15 @@ impl CFGR { Some(hse) => hse.freq, None => HSI_FREQUENCY, } - .raw(); + .raw() >> Self::BASE_CLK_SHIFT; let sysclk = if let Some(clk) = self.sysclk { clk } else { - base_clk + base_clk << Self::BASE_CLK_SHIFT }; - let p = if base_clk == sysclk { + let p = if base_clk << Self::BASE_CLK_SHIFT == sysclk { None } else { Some((sysclk - 1, sysclk + 1)) @@ -926,21 +936,21 @@ impl CFGR { // We check if (pllm, plln, pllp) allow to obtain the requested Sysclk, // so that we don't have to calculate them - let one_over_m = (1<> 1; + let one_over_p = ((1< 2u32, PLLP::Div4 => 4u32, PLLP::Div6 => 6u32, PLLP::Div8 => 8u32, - }; + } + 1) >> 1; let p_ok = (sysclk as u64) - == (((base_clk as u64 * self.plln as u64 * one_over_m as u64) >> Self::FIXED_POINT_SHIFT) - * one_over_p as u64) >> Self::FIXED_POINT_SHIFT; + == (((base_clk as u64 * self.plln as u64 * one_over_m as u64) >> Self::FIXED_POINT_RSHIFT) + * one_over_p as u64) >> Self::FIXED_POINT_RSHIFT << Self::BASE_CLK_SHIFT; if p_ok && q.is_none() { return; } - if let Some((m, n, p, q)) = CFGR::calculate_mnpq(base_clk, FreqRequest { p, q }) { + if let Some((m, n, p, q)) = CFGR::calculate_mnpq(base_clk << Self::BASE_CLK_SHIFT, FreqRequest { p, q }) { self.pllm = m as u8; self.plln = n as u16; if let Some(p) = p { From e1ce02993207b27f2a91b46f21e038ee5b2bd67e Mon Sep 17 00:00:00 2001 From: Bryan Kadzban Date: Sun, 11 Feb 2024 12:03:02 -0800 Subject: [PATCH 3/3] Run rustfmt on rcc.rs "cargo fmt" does not work because it requires both test mode and the #![no_std] attribute, which the crate lib.rs doesn't enable in test mode. But manually running rustfmt does. --- src/rcc.rs | 106 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 39 deletions(-) diff --git a/src/rcc.rs b/src/rcc.rs index 2a14092..c765101 100644 --- a/src/rcc.rs +++ b/src/rcc.rs @@ -616,16 +616,21 @@ impl CFGR { // the software division. Fortunately our 26 bit choice for the // decimal place, and the fact that these are 1/N, means we can // fit them into 26 bits, so a u32 is fine. - let one_over_m = ((1<> 1; - let one_over_p = ((1< 2u32, - PLLP::Div4 => 4u32, - PLLP::Div6 => 6u32, - PLLP::Div8 => 8u32, - } + 1) >> 1; - sysclk = - (((base_clk as u64 * self.plln as u64 * one_over_m as u64) >> Self::FIXED_POINT_RSHIFT) - * one_over_p as u64) >> Self::FIXED_POINT_RSHIFT << Self::BASE_CLK_SHIFT; + let one_over_m = ((1 << Self::FIXED_POINT_LSHIFT) / (self.pllm as u32) + 1) >> 1; + let one_over_p = ((1 << Self::FIXED_POINT_LSHIFT) + / match self.pllp { + PLLP::Div2 => 2u32, + PLLP::Div4 => 4u32, + PLLP::Div6 => 6u32, + PLLP::Div8 => 8u32, + } + + 1) + >> 1; + sysclk = (((base_clk as u64 * self.plln as u64 * one_over_m as u64) + >> Self::FIXED_POINT_RSHIFT) + * one_over_p as u64) + >> Self::FIXED_POINT_RSHIFT + << Self::BASE_CLK_SHIFT; } // Check if pll48clk is valid @@ -633,11 +638,15 @@ impl CFGR { match pll48clk { PLL48CLK::Pllq => { pll48clk_valid = { - let one_over_m = ((1<> 1; - let one_over_q = ((1<> 1; - let pll48clk = (((base_clk as u64 * self.plln as u64 - * one_over_m as u64) >> Self::FIXED_POINT_RSHIFT) - * one_over_q as u64) >> Self::FIXED_POINT_RSHIFT << Self::BASE_CLK_SHIFT; + let one_over_m = + ((1 << Self::FIXED_POINT_LSHIFT) / (self.pllm as u32) + 1) >> 1; + let one_over_q = + ((1 << Self::FIXED_POINT_LSHIFT) / (self.pllq as u32) + 1) >> 1; + let pll48clk = (((base_clk as u64 * self.plln as u64 * one_over_m as u64) + >> Self::FIXED_POINT_RSHIFT) + * one_over_q as u64) + >> Self::FIXED_POINT_RSHIFT + << Self::BASE_CLK_SHIFT; (48_000_000 - 120_000..=48_000_000 + 120_000).contains(&pll48clk) } } @@ -645,16 +654,23 @@ impl CFGR { pll48clk_valid = { if self.use_pllsai { // base_clk * pllsain has the same range as above - let one_over_m = ((1<> 1; - let one_over_p = ((1< 2u32, - PLLSAIP::Div4 => 4u32, - PLLSAIP::Div6 => 6u32, - PLLSAIP::Div8 => 8u32, - } + 1) >> 1; - let pll48clk = (((base_clk as u64 * self.pllsain as u64 - * one_over_m as u64) >> Self::FIXED_POINT_RSHIFT) - * one_over_p as u64) >> Self::FIXED_POINT_RSHIFT << Self::BASE_CLK_SHIFT; + let one_over_m = + ((1 << Self::FIXED_POINT_LSHIFT) / (self.pllm as u32) + 1) >> 1; + let one_over_p = ((1 << Self::FIXED_POINT_LSHIFT) + / match self.pllsaip { + PLLSAIP::Div2 => 2u32, + PLLSAIP::Div4 => 4u32, + PLLSAIP::Div6 => 6u32, + PLLSAIP::Div8 => 8u32, + } + + 1) + >> 1; + let pll48clk = + (((base_clk as u64 * self.pllsain as u64 * one_over_m as u64) + >> Self::FIXED_POINT_RSHIFT) + * one_over_p as u64) + >> Self::FIXED_POINT_RSHIFT + << Self::BASE_CLK_SHIFT; (48_000_000 - 120_000..=48_000_000 + 120_000).contains(&pll48clk) } else { false @@ -850,9 +866,12 @@ impl CFGR { continue; } // See the comments around Self::FIXED_POINT_LSHIFT to see how this works. - let one_over_m = ((1<> 1; - let f_vco_clock = (((f_pll_clock_input as u64 >> Self::BASE_CLK_SHIFT) * n as u64 - * one_over_m as u64) >> Self::FIXED_POINT_RSHIFT << Self::BASE_CLK_SHIFT) as u32; + let one_over_m = ((1 << Self::FIXED_POINT_LSHIFT) / (m as u32) + 1) >> 1; + let f_vco_clock = (((f_pll_clock_input as u64 >> Self::BASE_CLK_SHIFT) + * n as u64 + * one_over_m as u64) + >> Self::FIXED_POINT_RSHIFT + << Self::BASE_CLK_SHIFT) as u32; if f_vco_clock < 50_000_000 { m += 1; n = 432; @@ -908,7 +927,8 @@ impl CFGR { Some(hse) => hse.freq, None => HSI_FREQUENCY, } - .raw() >> Self::BASE_CLK_SHIFT; + .raw() + >> Self::BASE_CLK_SHIFT; let sysclk = if let Some(clk) = self.sysclk { clk @@ -936,21 +956,29 @@ impl CFGR { // We check if (pllm, plln, pllp) allow to obtain the requested Sysclk, // so that we don't have to calculate them - let one_over_m = ((1<> 1; - let one_over_p = ((1< 2u32, - PLLP::Div4 => 4u32, - PLLP::Div6 => 6u32, - PLLP::Div8 => 8u32, - } + 1) >> 1; + let one_over_m = ((1 << Self::FIXED_POINT_LSHIFT) / (self.pllm as u32) + 1) >> 1; + let one_over_p = ((1 << Self::FIXED_POINT_LSHIFT) + / match self.pllp { + PLLP::Div2 => 2u32, + PLLP::Div4 => 4u32, + PLLP::Div6 => 6u32, + PLLP::Div8 => 8u32, + } + + 1) + >> 1; let p_ok = (sysclk as u64) - == (((base_clk as u64 * self.plln as u64 * one_over_m as u64) >> Self::FIXED_POINT_RSHIFT) - * one_over_p as u64) >> Self::FIXED_POINT_RSHIFT << Self::BASE_CLK_SHIFT; + == (((base_clk as u64 * self.plln as u64 * one_over_m as u64) + >> Self::FIXED_POINT_RSHIFT) + * one_over_p as u64) + >> Self::FIXED_POINT_RSHIFT + << Self::BASE_CLK_SHIFT; if p_ok && q.is_none() { return; } - if let Some((m, n, p, q)) = CFGR::calculate_mnpq(base_clk << Self::BASE_CLK_SHIFT, FreqRequest { p, q }) { + if let Some((m, n, p, q)) = + CFGR::calculate_mnpq(base_clk << Self::BASE_CLK_SHIFT, FreqRequest { p, q }) + { self.pllm = m as u8; self.plln = n as u16; if let Some(p) = p {