From 64868149f8e989e1e6323889a712ed28d77a1632 Mon Sep 17 00:00:00 2001 From: pranavm2109 Date: Thu, 10 Oct 2024 12:44:09 -0400 Subject: [PATCH 1/9] updated LogNormalize.V3Matrix to include median scale factor functionality --- R/preprocessing.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/preprocessing.R b/R/preprocessing.R index 1de879390..a98890ea2 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -4336,6 +4336,12 @@ LogNormalize.V3Matrix <- function( if (verbose) { cat("Performing log-normalization\n", file = stderr()) } + + #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" + if (is.character(scale.factor) && scale.factor == "median") { + scale.factor <- median(Matrix::colSums(data)) + } + norm.data <- LogNorm(data, scale_factor = scale.factor, display_progress = verbose) colnames(x = norm.data) <- colnames(x = data) rownames(x = norm.data) <- rownames(x = data) From d21b00010fab6175b7626737822fc8fc8865cac2 Mon Sep 17 00:00:00 2001 From: pranavm2109 Date: Thu, 10 Oct 2024 16:45:33 -0400 Subject: [PATCH 2/9] updated RelativeCounts to include median scale factor functionality --- R/preprocessing.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/preprocessing.R b/R/preprocessing.R index a98890ea2..3450d56ca 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -3120,6 +3120,12 @@ RelativeCounts <- function(data, scale.factor = 1, verbose = TRUE) { if (verbose) { cat("Performing relative-counts-normalization\n", file = stderr()) } + + #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" + if (is.character(scale.factor) && scale.factor == "median") { + scale.factor <- median(Matrix::colSums(data)) + } + norm.data <- data norm.data@x <- norm.data@x / rep.int(Matrix::colSums(norm.data), diff(norm.data@p)) * scale.factor return(norm.data) From 959874c1302d7c553ed98d15f43a12de3ce6f848 Mon Sep 17 00:00:00 2001 From: pranavm2109 
Date: Thu, 10 Oct 2024 19:37:30 -0400 Subject: [PATCH 3/9] added median scale factor functionality to LogNormalize.default --- R/preprocessing5.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/R/preprocessing5.R b/R/preprocessing5.R index a767ab24b..a48b495ab 100644 --- a/R/preprocessing5.R +++ b/R/preprocessing5.R @@ -256,6 +256,17 @@ LogNormalize.default <- function( if (isTRUE(x = verbose)) { pb <- txtProgressBar(file = stderr(), style = 3) } + + #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" + if (is.character(scale.factor) && scale.factor == "median") { + sums <- if (margin == 1L) { + rowSums(data) # Sum of each row (gene) if margin is 1L + } else { + colSums(data) # Sum of each column (cell) if margin is 2L + } + scale.factor = median(sums) + } + for (i in seq_len(length.out = ncells)) { x <- if (margin == 1L) { data[i, ] From 023fe144853d71096ddc5909ef4fa291a75a4a80 Mon Sep 17 00:00:00 2001 From: pranavm2109 Date: Thu, 10 Oct 2024 19:47:28 -0400 Subject: [PATCH 4/9] added median scale factor functionality to LogNormalize.IterableMatrix --- R/preprocessing5.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/preprocessing5.R b/R/preprocessing5.R index a48b495ab..49cb9f76a 100644 --- a/R/preprocessing5.R +++ b/R/preprocessing5.R @@ -299,6 +299,12 @@ LogNormalize.IterableMatrix <- function( verbose = TRUE, ... 
) { + + #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" + if (is.character(scale.factor) && scale.factor == "median") { + scale.factor <- median(colSums(data)) + } + data <- BPCells::t(BPCells::t(data) / colSums(data)) # Log normalization data <- log1p(data * scale.factor) From 41fab47282617b80c739fbba76803dc1f3e4ce5f Mon Sep 17 00:00:00 2001 From: pranavm2109 Date: Thu, 10 Oct 2024 20:13:32 -0400 Subject: [PATCH 5/9] added median scale factor functionality to .SparseNormalize --- R/preprocessing5.R | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/R/preprocessing5.R b/R/preprocessing5.R index 49cb9f76a..7eb68a2b6 100644 --- a/R/preprocessing5.R +++ b/R/preprocessing5.R @@ -877,6 +877,32 @@ DISP <- function( p <- p + 1L } np <- length(x = p) - 1L + + if (is.character(scale.factor) && scale.factor == "median" && isTRUE(x = verbose)) { + cat("Calculating column sums for median scale factor\n", file = stderr()) + pb_median <- txtProgressBar(style = 3L, file = stderr()) + } + + #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" + if (is.character(scale.factor) && scale.factor == "median") { + col_sums <- numeric(np) + for (i in seq_len(length.out = np)) { + idx <- seq.int(from = p[i], to = p[i + 1] - 1L) + xidx <- slot(object = data, name = entryname)[idx] + col_sums[i] <- sum(xidx) + + if (isTRUE(x = verbose)) { + setTxtProgressBar(pb_median, value = i / np) + } + } + + if (isTRUE(x = verbose)) { + close(pb_median) + } + + scale.factor <- median(col_sums) + } + if (isTRUE(x = verbose)) { pb <- txtProgressBar(style = 3L, file = stderr()) } From e2cd31ac3cf5ac80d4fbb8dcde51879e8e0b2c18 Mon Sep 17 00:00:00 2001 From: pranavm2109 Date: Thu, 10 Oct 2024 20:35:55 -0400 Subject: [PATCH 6/9] modified documentation for updates to SparseNormalize --- R/preprocessing5.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/preprocessing5.R 
b/R/preprocessing5.R index 7eb68a2b6..984f5f87a 100644 --- a/R/preprocessing5.R +++ b/R/preprocessing5.R @@ -878,6 +878,7 @@ DISP <- function( } np <- length(x = p) - 1L + #adding a progress bar for median calculation is verbose is TRUE if (is.character(scale.factor) && scale.factor == "median" && isTRUE(x = verbose)) { cat("Calculating column sums for median scale factor\n", file = stderr()) pb_median <- txtProgressBar(style = 3L, file = stderr()) From 9fa3c180bc72a1d54448c9841fe2826e92a8ab48 Mon Sep 17 00:00:00 2001 From: pranavm2109 Date: Thu, 10 Oct 2024 20:46:32 -0400 Subject: [PATCH 7/9] added display message for median calculation if verbose --- R/preprocessing.R | 6 ++++++ R/preprocessing5.R | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/R/preprocessing.R b/R/preprocessing.R index 3450d56ca..5d102f482 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -3123,6 +3123,9 @@ RelativeCounts <- function(data, scale.factor = 1, verbose = TRUE) { #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" if (is.character(scale.factor) && scale.factor == "median") { + if(verbose){ + cat("Calculating column sums for median scale factor\n", file = stderr()) + } scale.factor <- median(Matrix::colSums(data)) } @@ -4345,6 +4348,9 @@ LogNormalize.V3Matrix <- function( #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" if (is.character(scale.factor) && scale.factor == "median") { + if(verbose){ + cat("Calculating column sums for median scale factor\n", file = stderr()) + } scale.factor <- median(Matrix::colSums(data)) } diff --git a/R/preprocessing5.R b/R/preprocessing5.R index 984f5f87a..ee1b6d0e3 100644 --- a/R/preprocessing5.R +++ b/R/preprocessing5.R @@ -259,6 +259,9 @@ LogNormalize.default <- function( #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" if (is.character(scale.factor) && 
scale.factor == "median") { + if(verbose){ + cat("Calculating column sums for median scale factor\n", file = stderr()) + } sums <- if (margin == 1L) { rowSums(data) # Sum of each row (gene) if margin is 1L } else { @@ -302,6 +305,9 @@ LogNormalize.IterableMatrix <- function( #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" if (is.character(scale.factor) && scale.factor == "median") { + if(verbose){ + cat("Calculating column sums for median scale factor\n", file = stderr()) + } scale.factor <- median(colSums(data)) } From 5fceb0626e07277b0486a415d2249660970e8452 Mon Sep 17 00:00:00 2001 From: pranavm2109 Date: Thu, 10 Oct 2024 21:06:59 -0400 Subject: [PATCH 8/9] modified comments for scale factor calculation if verbose --- R/preprocessing.R | 4 ++-- R/preprocessing5.R | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index 5d102f482..069a3b002 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -3124,7 +3124,7 @@ RelativeCounts <- function(data, scale.factor = 1, verbose = TRUE) { #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" if (is.character(scale.factor) && scale.factor == "median") { if(verbose){ - cat("Calculating column sums for median scale factor\n", file = stderr()) + cat("Calculating median scale factor\n", file = stderr()) } scale.factor <- median(Matrix::colSums(data)) } @@ -4349,7 +4349,7 @@ LogNormalize.V3Matrix <- function( #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" if (is.character(scale.factor) && scale.factor == "median") { if(verbose){ - cat("Calculating column sums for median scale factor\n", file = stderr()) + cat("Calculating median scale factor\n", file = stderr()) } scale.factor <- median(Matrix::colSums(data)) } diff --git a/R/preprocessing5.R b/R/preprocessing5.R index ee1b6d0e3..f50f08037 100644 
--- a/R/preprocessing5.R +++ b/R/preprocessing5.R @@ -260,7 +260,7 @@ LogNormalize.default <- function( #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" if (is.character(scale.factor) && scale.factor == "median") { if(verbose){ - cat("Calculating column sums for median scale factor\n", file = stderr()) + cat("Calculating median scale factor\n", file = stderr()) } sums <- if (margin == 1L) { rowSums(data) # Sum of each row (gene) if margin is 1L @@ -306,7 +306,7 @@ LogNormalize.IterableMatrix <- function( #setting scale.factor to be the median of counts across all columns if scale.factor is the string "median" if (is.character(scale.factor) && scale.factor == "median") { if(verbose){ - cat("Calculating column sums for median scale factor\n", file = stderr()) + cat("Calculating median scale factor\n", file = stderr()) } scale.factor <- median(colSums(data)) } @@ -886,7 +886,7 @@ DISP <- function( #adding a progress bar for median calculation is verbose is TRUE if (is.character(scale.factor) && scale.factor == "median" && isTRUE(x = verbose)) { - cat("Calculating column sums for median scale factor\n", file = stderr()) + cat("Calculating median scale factor\n", file = stderr()) pb_median <- txtProgressBar(style = 3L, file = stderr()) } From 50692a74c4925dd4938077008db278bdf8510407 Mon Sep 17 00:00:00 2001 From: pranavm2109 Date: Thu, 10 Oct 2024 22:57:27 -0400 Subject: [PATCH 9/9] tested all modified functions via unit testing in test_preprocessing.R --- tests/testthat/test_preprocessing.R | 53 +++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/testthat/test_preprocessing.R b/tests/testthat/test_preprocessing.R index a28af6ddd..a5db73b06 100644 --- a/tests/testthat/test_preprocessing.R +++ b/tests/testthat/test_preprocessing.R @@ -97,6 +97,21 @@ test_that("Relative count normalization returns expected values", { expect_equal(rc.counts[2, 1], 14285.71, tolerance = 1e-6) }) 
+denseMatrix <- as.matrix(pbmc.test) # Matrix to test LogNormalize.V3Matrix and RelativeCounts methods +test_that("LogNormalize.V3Matrix computes median scale factor correctly", { + expectedMedian <- median(colSums(denseMatrix)) + resultFromExpectedMedian <- LogNormalize.V3Matrix(data = denseMatrix, scale.factor = expectedMedian, margin = 2L, verbose = FALSE) + resultFromScaleFactorSetToMedian <- LogNormalize.V3Matrix(data = denseMatrix, scale.factor = "median", margin = 2L, verbose = FALSE) + expect_equal(as.matrix(resultFromExpectedMedian), as.matrix(resultFromScaleFactorSetToMedian), tolerance = 1e-6) +}) + +test_that("RelativeCounts computes median scale factor correctly", { + expectedMedian <- median(colSums(denseMatrix)) + resultFromExpectedMedian <- RelativeCounts(data = denseMatrix, scale.factor = expectedMedian, verbose = FALSE) + resultFromScaleFactorSetToMedian <- RelativeCounts(data = denseMatrix, scale.factor = "median", verbose = FALSE) + expect_equal(as.matrix(resultFromExpectedMedian), as.matrix(resultFromScaleFactorSetToMedian), tolerance = 1e-6) +}) + # Tests for v5 NormalizeData # -------------------------------------------------------------------------------- context("v5 NormalizeData") @@ -175,6 +190,44 @@ test_that("LogNormalize normalizes properly for BPCells", { ) }) +test_that("LogNormalize.IterableMatrix computes median scale factor correctly", { + skip_on_cran() + library(Matrix) + skip_if_not_installed("BPCells") + library(BPCells) + mat_bpcells <- t(as(t(object[['RNA']]$counts ), "IterableMatrix")) + expectedMedian <- median(colSums(mat_bpcells)) + resultFromExpectedMedian <- LogNormalize.IterableMatrix(data = mat_bpcells, scale.factor = expectedMedian, margin = 2L, verbose = FALSE) + resultFromScaleFactorSetToMedian <- LogNormalize.IterableMatrix(data = mat_bpcells, scale.factor = "median", margin = 2L, verbose = FALSE) + expect_equal(as.matrix(resultFromExpectedMedian), as.matrix(resultFromScaleFactorSetToMedian), tolerance = 1e-6) 
+}) + +denseMatrix <- as.matrix(pbmc.test) # Matrix to test LogNormalize.default when scale.factor is set to "median" +test_that("LogNormalize.default computes median scale factor correctly for both margin values", { + expectedMedianForMargin1L <- median(rowSums(denseMatrix)) + expectedMedianForMargin2L <- median(colSums(denseMatrix)) + + resultFromExpectedMedianForMargin1L <- LogNormalize.default(data = denseMatrix, scale.factor = expectedMedianForMargin1L, margin = 1L, verbose = FALSE) + resultFromExpectedMedianForMargin2L <- LogNormalize.default(data = denseMatrix, scale.factor = expectedMedianForMargin2L, margin = 2L, verbose = FALSE) + + resultsFromScaleFactorSetToMedianForMargin1L <- LogNormalize.default(data = denseMatrix, scale.factor = "median", margin = 1L, verbose = FALSE)#if the normalization is across rows (genes) + resultsFromScaleFactorSetToMedianForMargin2L <- LogNormalize.default(data = denseMatrix, scale.factor = "median", margin = 2L, verbose = FALSE)#if the normalization is across columns (cells) + + expect_equal(as.matrix(resultFromExpectedMedianForMargin1L), as.matrix(resultsFromScaleFactorSetToMedianForMargin1L), tolerance = 1e-6) + expect_equal(as.matrix(resultFromExpectedMedianForMargin2L), as.matrix(resultsFromScaleFactorSetToMedianForMargin2L), tolerance = 1e-6) +}) + +theSparseMatrix <- as.sparse(denseMatrix) # Sparse Matrix to test .SparseNormalize computes median scale factor correctly +test_that("LogNormalize.default computes median scale factor correctly for both margin values", { + expectedMedian <- median(colSums(theSparseMatrix)) + + resultFromExpectedMedian <- .SparseNormalize(data = theSparseMatrix, scale.factor = expectedMedian, verbose = FALSE) + resultsFromScaleFactorSetToMedian <- .SparseNormalize(data = theSparseMatrix, scale.factor = "median", verbose = FALSE) + + expect_equal(resultFromExpectedMedian, resultsFromScaleFactorSetToMedian, tolerance = 1e-6) +}) + + # Tests for ScaleData # 
-------------------------------------------------------------------------------- context("ScaleData")