Skip to content

Commit

Permalink
update get_attributes_comb_statistics by assembling different stat types
Browse files Browse the repository at this point in the history
  • Loading branch information
xx01cyx committed Nov 15, 2024
1 parent 6eb725b commit f005602
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 17 deletions.
86 changes: 80 additions & 6 deletions optd-cost-model/src/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@ use optd_persistent::{
CostModelStorageLayer,
};

use crate::{common::types::TableId, stats::AttributeCombValueStats, CostModelResult};
use crate::{
common::types::TableId,
stats::{counter::Counter, AttributeCombValueStats, Distribution, MostCommonValues},
CostModelResult,
};

/// TODO: documentation
pub struct CostModelStorageManager<S: CostModelStorageLayer> {
Expand All @@ -19,7 +23,8 @@ impl<S: CostModelStorageLayer> CostModelStorageManager<S> {
Self { backend_manager }
}

/// TODO: documentation
/// Gets the attribute information for a given table and attribute base index.
///
/// TODO: if we have memory cache,
/// we should add the reference. (&Field)
pub async fn get_attribute_info(
Expand All @@ -33,23 +38,92 @@ impl<S: CostModelStorageLayer> CostModelStorageManager<S> {
.await?)
}

/// TODO: documentation
/// Gets the latest statistics for a given table.
///
/// Currently, in `AttributeCombValueStats`, only `Distribution` is optional.
/// This poses a question about the behavior of the system if there is no corresponding
/// `MostCommonValues`, `ndistinct`, or other statistics. We should have a clear
/// specification about the behavior of the system in the presence of missing statistics.
///
/// TODO: if we have memory cache,
/// we should add the reference. (&AttributeCombValueStats)
///
/// TODO: Shall we pass in an epoch here to make sure that the statistics are from the same
/// epoch?
pub async fn get_attributes_comb_statistics(
&self,
table_id: TableId,
attr_base_indices: &[i32],
) -> CostModelResult<Option<AttributeCombValueStats>> {
Ok(self
let dist: Option<Distribution> = self
.backend_manager
.get_stats_for_attr_indices_based(
table_id.into(),
attr_base_indices.to_vec(),
StatType::Distribution,
None,
)
.await?
.map(|json| serde_json::from_value(json).unwrap());

let mcvs = self
.backend_manager
.get_stats_for_attr_indices_based(
table_id.into(),
attr_base_indices.to_vec(),
StatType::MostCommonValues,
None,
)
.await?
.map(|json| serde_json::from_value(json).unwrap())
.unwrap_or_else(|| MostCommonValues::Counter(Counter::default()));

let ndistinct = self
.backend_manager
.get_stats_for_attr_indices_based(
table_id.into(),
attr_base_indices.to_vec(),
StatType::Comb,
StatType::Cardinality,
None,
)
.await?
.map(|json| json.into()))
.map(|json| serde_json::from_value(json).unwrap())
.unwrap_or(0);

let table_row_count = self
.backend_manager
.get_stats_for_attr_indices_based(
table_id.into(),
attr_base_indices.to_vec(),
StatType::TableRowCount,
None,
)
.await?
.map(|json| serde_json::from_value(json).unwrap())
.unwrap_or(0);
let non_null_count = self
.backend_manager
.get_stats_for_attr_indices_based(
table_id.into(),
attr_base_indices.to_vec(),
StatType::NonNullCount,
None,
)
.await?
.map(|json| serde_json::from_value(json).unwrap())
.unwrap_or(0);

// FIXME: Only minimal checks for invalid values are conducted here. We should have
// a much clearer specification about the behavior of the system in the presence of
// invalid statistics.
let null_frac = if table_row_count == 0 {
0.0
} else {
1.0 - (non_null_count as f64 / table_row_count as f64)
};

Ok(Some(AttributeCombValueStats::new(
mcvs, ndistinct, null_frac, dist,
)))
}
}
4 changes: 2 additions & 2 deletions optd-persistent/src/cost_model/catalog/mock_catalog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,15 +115,15 @@ impl MockCatalog {
let statistics: Vec<MockStatistic> = vec![
MockStatistic {
id: 1,
stat_type: StatType::NotNullCount as i32,
stat_type: StatType::NonNullCount as i32,
stat_value: json!(100),
attr_ids: vec![1],
table_id: None,
name: "CountAttr1".to_string(),
},
MockStatistic {
id: 2,
stat_type: StatType::NotNullCount as i32,
stat_type: StatType::NonNullCount as i32,
stat_value: json!(200),
attr_ids: vec![2],
table_id: None,
Expand Down
14 changes: 10 additions & 4 deletions optd-persistent/src/cost_model/interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,20 @@ pub enum ConstraintType {
/// TODO: documentation
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum StatType {
/// A combination of multiple statistics, e.g. most common values, distribution.
Comb,
/// `TableRowCount` only applies to table statistics.
/// The row count in a table. `TableRowCount` only applies to table statistics.
TableRowCount,
NotNullCount,
/// The number of non-null values in a column.
NonNullCount,
/// The number of distinct values in a column.
Cardinality,
/// The minimum value in a column.
Min,
/// The maximum value in a column.
Max,
/// The frequency of each value in a column.
MostCommonValues,
/// The distribution of values in a column.
Distribution,
}

/// TODO: documentation
Expand Down
10 changes: 5 additions & 5 deletions optd-persistent/src/cost_model/orm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,7 @@ mod tests {
assert!(stat_res.is_ok());
assert_eq!(stat_res.unwrap().unwrap(), json!(300));
let stat_res = backend_manager
.get_stats_for_attr([2].to_vec(), StatType::NotNullCount, None)
.get_stats_for_attr([2].to_vec(), StatType::NonNullCount, None)
.await;
assert!(stat_res.is_ok());
assert_eq!(stat_res.unwrap().unwrap(), json!(200));
Expand All @@ -679,7 +679,7 @@ mod tests {
.await
.unwrap();
let stat = Stat {
stat_type: StatType::NotNullCount,
stat_type: StatType::NonNullCount,
stat_value: json!(100),
attr_ids: vec![1],
table_id: None,
Expand All @@ -698,7 +698,7 @@ mod tests {
println!("{:?}", stat_res);
assert_eq!(stat_res[0].number_of_attributes, 1);
assert_eq!(stat_res[0].description, "1".to_string());
assert_eq!(stat_res[0].variant_tag, StatType::NotNullCount as i32);
assert_eq!(stat_res[0].variant_tag, StatType::NonNullCount as i32);
let stat_attr_res = StatisticToAttributeJunction::find()
.filter(statistic_to_attribute_junction::Column::StatisticId.eq(stat_res[0].id))
.all(&backend_manager.db)
Expand Down Expand Up @@ -761,7 +761,7 @@ mod tests {
.await
.unwrap();
let stat2 = Stat {
stat_type: StatType::NotNullCount,
stat_type: StatType::NonNullCount,
stat_value: json!(200),
attr_ids: vec![1],
table_id: None,
Expand Down Expand Up @@ -815,7 +815,7 @@ mod tests {
// 3. Update an existing stat with the same value
let epoch_num = Event::find().all(&backend_manager.db).await.unwrap().len();
let stat3 = Stat {
stat_type: StatType::NotNullCount,
stat_type: StatType::NonNullCount,
stat_value: json!(200),
attr_ids: vec![1],
table_id: None,
Expand Down

0 comments on commit f005602

Please sign in to comment.