Merge pull request #7 from gwaygenomics/master

gwaybio · web-flow · commit c88c9d2f6f42 · 2016-08-30T10:10:36.000-04:00
Checkpoint Package and Custom SAM subsetting
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
-.DS_Store
+*.DS_Store
+Rplots.pdf
diff --git a/1.DataInclusion/Scripts/A.getInclusion.R b/1.DataInclusion/Scripts/A.getInclusion.R
@@ -7,6 +7,9 @@
 # This script will perform our inclusion criteria on all datasets included 
 # in curatedOvarianData and a dataset from the Mayo Clinic
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 ####################################
 # Load Libraries
 ####################################
diff --git a/1.DataInclusion/Scripts/B.getGenes.R b/1.DataInclusion/Scripts/B.getGenes.R
@@ -7,6 +7,9 @@
 # Output gene lists (common genes and MAD genes) and
 # Overlapping Genes Venn Diagram: Supplemental Figure 1
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 args <- commandArgs(trailingOnly = T)
 #args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
 ################################
@@ -119,4 +122,4 @@ venn.plot <- venn.diagram(x = list('TCGA' = TCGA_venn,
                           cat.pos = c(0, -20, 20, -20, 20),
                           cat.dist = c(.19, .21, -.18, -.2, .21),
                           margin = .1, cex = 0.8,
-                          main.cex = 2)
+                          main.cex = 2)
diff --git a/1.DataInclusion/Scripts/processMayoEset/Agilent1and2and3_COMBAT_datamerge.R b/1.DataInclusion/Scripts/processMayoEset/Agilent1and2and3_COMBAT_datamerge.R
@@ -18,6 +18,9 @@
 ##  Modified date : 19 September 2015 12:00 PM EST
 ######################################################## 
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint("2016-03-01", checkpointLocation = "."))
+
 ######################## RUN under R version 2.15.0 (2012-03-30)
 ######################## TESTED under R version 3.1.2 (2014-10-31)
 ###################################################################
@@ -171,4 +174,4 @@ sum(is.na(comb.mtx2))
 #[1] 29815
 
 # Write comb.mtx99 to file
-write.table(comb.mtx2, "1.DataInclusion/Data/Mayo/COMBATadj_withNAcy5cy3.tsv", sep = "\t")
+write.table(comb.mtx2, "1.DataInclusion/Data/Mayo/COMBATadj_withNAcy5cy3.tsv", sep = "\t")
diff --git a/1.DataInclusion/Scripts/processMayoEset/createMayoEset.R b/1.DataInclusion/Scripts/processMayoEset/createMayoEset.R
@@ -7,6 +7,9 @@
 # This script will take the normalized matrix from the Mayo Data and output 
 # an eset to be used in subsequent analyses
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 ##########################
 # Load Libraries
 ##########################
diff --git a/2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R b/2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
@@ -8,8 +8,11 @@
 # across k and across datasets, it will output cluster membership files and 
 # correlations
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 args <- commandArgs(trailingOnly=TRUE)
-#args <- c(2, 4, 20, 123, FALSE, TRUE, "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
+#args <- c(2, 4, 20, 123, FALSE, FALSE, "madgenes", "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
 ############################################
 # Load Libraries
 ############################################
@@ -39,6 +42,7 @@ kStarts <- as.numeric(paste(args[3]))
 kSEED <- as.numeric(paste(args[4])) 
 bNMF <- as.logical(args[5])
 shuffle <- as.logical(args[6])
+SAM_subset <- args[7]
 
 # Separate the eset arguments from the rest of the commandArgs
 argsCurated <- args[grep("eset", args)]
@@ -55,7 +59,7 @@ if("Mayo" %in% args) {
 }
 
 # Use the LoadOVCA_Data function to read in the datasets subset by commongenes
-ExpData <- LoadOVCA_Data(datasets = argsCurated, genelist_subset = "commongenes", shuffle = shuffle)
+ExpData <- LoadOVCA_Data(datasets = argsCurated, genelist_subset = SAM_subset, shuffle = shuffle)
 
 # Read in common genes csv file. The csv was generated as an intersection of the genes in 
 # TCGA, Yoshihara, Mayo, Tothill, and Bonome
@@ -494,11 +498,13 @@ for (centroid in 1:length(Dlist.mapped)) {
     if (!bNMF) {
       write.table(WithinDatasetCor[[centroid]], file = 
                     paste("2.Clustering_DiffExprs/Tables/WithinCor/", names(WithinDatasetCor)[centroid],
-                          "WithinDatasetCorrelations.csv", sep = ""), row.names = T, col.names = NA, sep = ",")
+                          '_', SAM_subset,  "_WithinDatasetCorrelations.csv", sep = ""),
+                  row.names = T, col.names = NA, sep = ",")
     } else {
       write.table(WithinDatasetCor[[centroid]], file = 
                     paste("2.Clustering_DiffExprs/Figures/nmf/WithinCor/", names(WithinDatasetCor)[centroid],
-                          "nmf_WithinDatasetCorrelations.csv", sep = ""), row.names = T, col.names = NA, sep = ",")
+                          '_', SAM_subset, "nmf_WithinDatasetCorrelations.csv", sep = ""),
+                  row.names = T, col.names = NA, sep = ",")
     }
   }
 }
@@ -532,10 +538,12 @@ for (centroid in 1:length(Dlist.mapped.cor)) {
   if (!shuffle) {
     if (!bNMF) {
       write.table(tmpCor, file = paste("2.Clustering_DiffExprs/Tables/AcrossCor/AcrossDatasetCor_K",
-                                       krange[centroid], ".csv", sep = ""), sep = ",", row.names = T, col.names = NA)
+                                       krange[centroid], "_", SAM_subset, ".csv", sep = ""),
+                  sep = ",", row.names = T, col.names = NA)
     } else {
       write.table(tmpCor, file = paste("2.Clustering_DiffExprs/Figures/nmf/AcrossCor/AcrossDatasetCor_nmf_K",
-                                       krange[centroid], ".csv", sep = ""), sep = ",", row.names = T, col.names = NA)
+                                       krange[centroid], "_", SAM_subset, ".csv", sep = ""),
+                  sep = ",", row.names = T, col.names = NA)
     }
   }
 }
diff --git a/2.Clustering_DiffExprs/Scripts/B.CorrelationMatrix.R b/2.Clustering_DiffExprs/Scripts/B.CorrelationMatrix.R
@@ -7,6 +7,9 @@
 # This script will input a series of datasets and output sample by sample 
 # correlation matrix heatmaps
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 args <- commandArgs(trailingOnly=TRUE)
 #args <- c(2, 4, 123, "Figures/CorrelationMatrix/", "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
 ###########################################
diff --git a/2.Clustering_DiffExprs/Scripts/C.KMeansBarCharts.R b/2.Clustering_DiffExprs/Scripts/C.KMeansBarCharts.R
@@ -6,6 +6,9 @@
 # ~~~~~~~~~~~~~~~~~~~~~
 # This script will input a series of datasets and output bar charts
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 args <- commandArgs(trailingOnly=TRUE)
 # args <- c("2", "4", "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
 
diff --git a/2.Clustering_DiffExprs/Scripts/D.NMF.R b/2.Clustering_DiffExprs/Scripts/D.NMF.R
@@ -6,6 +6,9 @@
 # ~~~~~~~~~~~~~~~~~~~~~
 # This script will input a series of datasets and perform Non-negative Matrix Factorization (NMF)
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 args <- commandArgs(trailingOnly = TRUE)
 # args <- c(2, 4, 100, 123, "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
 ################################
diff --git a/2.Clustering_DiffExprs/Scripts/E.kmeans_v_nmf.R b/2.Clustering_DiffExprs/Scripts/E.kmeans_v_nmf.R
@@ -6,6 +6,9 @@
 # ~~~~~~~~~~~~~~~~~~~~~
 # This script will compare clusters identified by k-means and NMF
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 args <- commandArgs(trailingOnly=TRUE)
 # args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset")
 
diff --git a/2.Clustering_DiffExprs/Scripts/F.clusterMembership.R b/2.Clustering_DiffExprs/Scripts/F.clusterMembership.R
@@ -6,6 +6,9 @@
 # ~~~~~~~~~~~~~~~~~~~~~
 # This script will output a data frame of cluster membership for all clustering events
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 args <- commandArgs(trailingOnly=TRUE)
 #args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
 
diff --git a/2.Clustering_DiffExprs/Scripts/G.Dataset_concordance.R b/2.Clustering_DiffExprs/Scripts/G.Dataset_concordance.R
@@ -9,6 +9,9 @@
 # identified in the 2008 Clin Cancer Research Paper, and the original Konecny 
 # subtypes identified in the 2014 JNCI paper
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 library(curatedOvarianData)
 
 ############################################
diff --git a/2.Clustering_DiffExprs/Scripts/H.TCGA_LMP_TothillPrediction.R b/2.Clustering_DiffExprs/Scripts/H.TCGA_LMP_TothillPrediction.R
@@ -8,6 +8,9 @@
 # except without removing LMP samples to contrast our biological filtering
 # findings
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 set.seed(123)
 
 ################################
diff --git a/2.Clustering_DiffExprs/Scripts/H.TCGA_SupFig6.2_TothillPrediction.R b/2.Clustering_DiffExprs/Scripts/H.TCGA_SupFig6.2_TothillPrediction.R
diff --git a/3.Fit/Scripts/A.GoodnessFit.R b/3.Fit/Scripts/A.GoodnessFit.R
@@ -6,6 +6,9 @@
 # ~~~~~~~~~~~~~~~~~~~~~
 # This script will output the AIC, BIC, and Silhouette Width plots
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 args <- commandArgs(trailingOnly=TRUE)
 # args <- c(2, 8, 20, 20, 123, "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset")
 ############################################
diff --git a/3.Fit/Scripts/B.GAP_GoodnessFit.R b/3.Fit/Scripts/B.GAP_GoodnessFit.R
diff --git a/4.Survival/Scripts/A.Survival.R b/4.Survival/Scripts/A.Survival.R
@@ -6,6 +6,9 @@
 # ~~~~~~~~~~~~~~~~~~~~~
 # This script will output kaplan-meier curves and survival assessments for all input datasets
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 args <- commandArgs(trailingOnly=TRUE)
 
 # args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset")
diff --git a/4.Survival/Scripts/B.Summarize_Survival.R b/4.Survival/Scripts/B.Summarize_Survival.R
@@ -5,8 +5,11 @@
 # Konecny, G., Goode, E., Greene, C.S., Doherty, J.A.
 # ~~~~~~~~~~~~~~~~~~~~~
 # This script will summarize all of the multi-variate and univariate survival analyses
-
 ############################################
+
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 args <- commandArgs(trailingOnly=TRUE)
 # args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset")
 
diff --git a/5.Pathway/Scripts/A.GeneEnrichment.R b/5.Pathway/Scripts/A.GeneEnrichment.R
@@ -7,6 +7,9 @@
 # The script will output a table of the differentially expressed genes for each cluster, 
 # both positively and negatively regulated
 
+suppressMessages(library(checkpoint))
+suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
+
 args <- commandArgs(trailingOnly=TRUE)
 #args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset")
 
diff --git a/ANALYSIS.sh b/ANALYSIS.sh
@@ -40,6 +40,7 @@ SEED=123
 NSTARTS=20
 NO_SHUFFLE=FALSE
 SHUFFLE=TRUE
+SAM_SUBSET='commongenes'
 
 #################
 # PART ONE: 
@@ -54,7 +55,7 @@ SHUFFLE=TRUE
 # NOTE: The Mayo Clinic Data is not currently in curatedOvarianData.
 
 # Output the samples for each dataset that pass the inclusion criteria
-R --no-save --args < 1.DataInclusion/Scripts/A.getInclusion.R  # (Table 1)
+Rscript 1.DataInclusion/Scripts/A.getInclusion.R  # (Table 1)
 
 # Output the common genes and the MAD (Median Absolute Deviation) genes to be 
 # used in developing moderated t score vectors and in clustering, respectively. 
@@ -77,11 +78,19 @@ B.getGenes.R
 # ~~~~~~~~~~~~~~~~~~~~~
 
 # ~~~~~~~~~~~~~
-# k means & SAM
+# SAM with MAD genes
 # ~~~~~~~~~~~~~
-# Perform k means and SAM (args: kmin, kmax, nstarts, seed, bNMF) (Figure 1)
-R --no-save --args $KMIN $KMAX $NSTARTS $SEED FALSE $NO_SHUFFLE $DATASETS \
-"GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
+# Output across-dataset correlations computed with the MAD genes only
+# NOTE: the common genes (SAM_SUBSET) are used in all downstream analyses
+R --no-save --args $KMIN $KMAX $NSTARTS $SEED FALSE $NO_SHUFFLE "madgenes" \
+$DATASETS "GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
+
+# ~~~~~~~~~~~~~
+# k means & SAM (with common genes)
+# ~~~~~~~~~~~~~
+# Perform k means and SAM (Figure 1)
+R --no-save --args $KMIN $KMAX $NSTARTS $SEED FALSE $NO_SHUFFLE $SAM_SUBSET \
+$DATASETS "GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
 
 # Output correlation matrices (Sup. Fig. S2)	
 R --no-save --args $KMIN $KMAX $SEED Figures/CorrelationMatrix/ $DATASETS \
@@ -92,8 +101,8 @@ R --no-save --args $KMIN $KMAX $DATASETS < 2.Clustering_DiffExprs/Scripts/\
 C.KMeansBarCharts.R
 
 # Shuffle genes to compare across population correlations in real data
-R --no-save --args $KMIN $KMAX $NSTARTS $SEED FALSE $SHUFFLE $DATASETS \
-"GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
+R --no-save --args $KMIN $KMAX $NSTARTS $SEED FALSE $SHUFFLE $SAM_SUBSET \
+$DATASETS "GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
 
 # ~~~~~~~~~~~~~
 # NMF
@@ -104,8 +113,8 @@ R --no-save --args $KMIN $KMAX $NSTARTS $SEED $DATASETS "GSE26712_eset" \
 < 2.Clustering_DiffExprs/Scripts/D.NMF.R
 
 # Run SAM on NMF clusters (TRUE argument forces NMF analysis)
-R --no-save --args $KMIN $KMAX $NSTARTS $SEED TRUE $NO_SHUFFLE $DATASETS \
-"GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
+R --no-save --args $KMIN $KMAX $NSTARTS $SEED TRUE $NO_SHUFFLE $SAM_SUBSET \
+$DATASETS "GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
 
 # ~~~~~~~~~~~~~
 # k means vs. NMF
@@ -125,7 +134,7 @@ F.clusterMembership.R
 R --no-save < 2.Clustering_DiffExprs/Scripts/G.Dataset_concordance.R
 
 # ~~~~~~~~~~~~~
-#  Tothill LMP  #
+# Tothill LMP
 # ~~~~~~~~~~~~~
 # Observe consensus matrices and cophenetic coefficients for Tothill dataset if 
 # LMP samples are not removed. This is similar to the results presented by TCGA 
diff --git a/INSTALL.R b/INSTALL.R

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1,2 @@` |
| `1` |  | `-.DS_Store` |
|  | `1` | `+*.DS_Store` |
|  | `2` | `+Rplots.pdf` |