Skip to content

Commit c88c9d2

Browse files
authored
Merge pull request #7 from gwaygenomics/master
Checkpoint Package and Custom SAM subsetting
2 parents 5c26aa0 + 0c26fce commit c88c9d2

21 files changed

+96
-112
lines changed

.gitignore

100644100755
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
.DS_Store
1+
*.DS_Store
2+
Rplots.pdf

1.DataInclusion/Scripts/A.getInclusion.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
# This script will perform our inclusion criteria on all datasets included
88
# in curatedOvarianData and a dataset from the Mayo Clinic
99

10+
suppressMessages(library(checkpoint))
11+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
12+
1013
####################################
1114
# Load Libraries
1215
####################################

1.DataInclusion/Scripts/B.getGenes.R

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
# Output gene lists (common genes and MAD genes) and
88
# Overlapping Genes Venn Diagram: Supplemental Figure 1
99

10+
suppressMessages(library(checkpoint))
11+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
12+
1013
args <- commandArgs(trailingOnly = T)
1114
#args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
1215
################################
@@ -119,4 +122,4 @@ venn.plot <- venn.diagram(x = list('TCGA' = TCGA_venn,
119122
cat.pos = c(0, -20, 20, -20, 20),
120123
cat.dist = c(.19, .21, -.18, -.2, .21),
121124
margin = .1, cex = 0.8,
122-
main.cex = 2)
125+
main.cex = 2)

1.DataInclusion/Scripts/processMayoEset/Agilent1and2and3_COMBAT_datamerge.R

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
## Modified date : 19 September 2015 12:00 PM EST
1919
########################################################
2020

21+
suppressMessages(library(checkpoint))
22+
suppressMessages(checkpoint("2016-03-01", checkpointLocation = "."))
23+
2124
######################## RUN under R version 2.15.0 (2012-03-30)
2225
######################## TESTED under R version 3.1.2 (2014-10-31)
2326
###################################################################
@@ -171,4 +174,4 @@ sum(is.na(comb.mtx2))
171174
#[1] 29815
172175

173176
# Write comb.mtx2 to file
174-
write.table(comb.mtx2, "1.DataInclusion/Data/Mayo/COMBATadj_withNAcy5cy3.tsv", sep = "\t")
177+
write.table(comb.mtx2, "1.DataInclusion/Data/Mayo/COMBATadj_withNAcy5cy3.tsv", sep = "\t")

1.DataInclusion/Scripts/processMayoEset/createMayoEset.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
# This script will take the normalized matrix from the Mayo Data and output
88
# an eset to be used in subsequent analyses
99

10+
suppressMessages(library(checkpoint))
11+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
12+
1013
##########################
1114
# Load Libraries
1215
##########################

2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@
88
# across k and across datasets, it will output cluster membership files and
99
# correlations
1010

11+
suppressMessages(library(checkpoint))
12+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
13+
1114
args <- commandArgs(trailingOnly=TRUE)
12-
#args <- c(2, 4, 20, 123, FALSE, TRUE, "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
15+
#args <- c(2, 4, 20, 123, FALSE, FALSE, "madgenes", "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
1316
############################################
1417
# Load Libraries
1518
############################################
@@ -39,6 +42,7 @@ kStarts <- as.numeric(paste(args[3]))
3942
kSEED <- as.numeric(paste(args[4]))
4043
bNMF <- as.logical(args[5])
4144
shuffle <- as.logical(args[6])
45+
SAM_subset <- args[7]
4246

4347
# Separate the eset arguments from the rest of the commandArgs
4448
argsCurated <- args[grep("eset", args)]
@@ -55,7 +59,7 @@ if("Mayo" %in% args) {
5559
}
5660

5761
# Use the LoadOVCA_Data function to read in the datasets subset by the gene list named in SAM_subset
58-
ExpData <- LoadOVCA_Data(datasets = argsCurated, genelist_subset = "commongenes", shuffle = shuffle)
62+
ExpData <- LoadOVCA_Data(datasets = argsCurated, genelist_subset = SAM_subset, shuffle = shuffle)
5963

6064
# Read in common genes csv file. The csv was generated as an intersection of the genes in
6165
# TCGA, Yoshihara, Mayo, Tothill, and Bonome
@@ -494,11 +498,13 @@ for (centroid in 1:length(Dlist.mapped)) {
494498
if (!bNMF) {
495499
write.table(WithinDatasetCor[[centroid]], file =
496500
paste("2.Clustering_DiffExprs/Tables/WithinCor/", names(WithinDatasetCor)[centroid],
497-
"WithinDatasetCorrelations.csv", sep = ""), row.names = T, col.names = NA, sep = ",")
501+
'_', SAM_subset, "_WithinDatasetCorrelations.csv", sep = ""),
502+
row.names = T, col.names = NA, sep = ",")
498503
} else {
499504
write.table(WithinDatasetCor[[centroid]], file =
500505
paste("2.Clustering_DiffExprs/Figures/nmf/WithinCor/", names(WithinDatasetCor)[centroid],
501-
"nmf_WithinDatasetCorrelations.csv", sep = ""), row.names = T, col.names = NA, sep = ",")
506+
'_', SAM_subset, "nmf_WithinDatasetCorrelations.csv", sep = ""),
507+
row.names = T, col.names = NA, sep = ",")
502508
}
503509
}
504510
}
@@ -532,10 +538,12 @@ for (centroid in 1:length(Dlist.mapped.cor)) {
532538
if (!shuffle) {
533539
if (!bNMF) {
534540
write.table(tmpCor, file = paste("2.Clustering_DiffExprs/Tables/AcrossCor/AcrossDatasetCor_K",
535-
krange[centroid], ".csv", sep = ""), sep = ",", row.names = T, col.names = NA)
541+
krange[centroid], "_", SAM_subset, ".csv", sep = ""),
542+
sep = ",", row.names = T, col.names = NA)
536543
} else {
537544
write.table(tmpCor, file = paste("2.Clustering_DiffExprs/Figures/nmf/AcrossCor/AcrossDatasetCor_nmf_K",
538-
krange[centroid], ".csv", sep = ""), sep = ",", row.names = T, col.names = NA)
545+
krange[centroid], "_", SAM_subset, ".csv", sep = ""),
546+
sep = ",", row.names = T, col.names = NA)
539547
}
540548
}
541549
}

2.Clustering_DiffExprs/Scripts/B.CorrelationMatrix.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
# This script will input a series of datasets and output sample by sample
88
# correlation matrix heatmaps
99

10+
suppressMessages(library(checkpoint))
11+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
12+
1013
args <- commandArgs(trailingOnly=TRUE)
1114
#args <- c(2, 4, 123, "Figures/CorrelationMatrix/", "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
1215
###########################################

2.Clustering_DiffExprs/Scripts/C.KMeansBarCharts.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
# ~~~~~~~~~~~~~~~~~~~~~
77
# This script will input a series of datasets and output bar charts
88

9+
suppressMessages(library(checkpoint))
10+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
11+
912
args <- commandArgs(trailingOnly=TRUE)
1013
# args <- c("2", "4", "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
1114

2.Clustering_DiffExprs/Scripts/D.NMF.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
# ~~~~~~~~~~~~~~~~~~~~~
77
# This script will input a series of datasets and perform Non-negative Matrix Factorization (NMF)
88

9+
suppressMessages(library(checkpoint))
10+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
11+
912
args <- commandArgs(trailingOnly = TRUE)
1013
# args <- c(2, 4, 100, 123, "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
1114
################################

2.Clustering_DiffExprs/Scripts/E.kmeans_v_nmf.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
# ~~~~~~~~~~~~~~~~~~~~~
77
# This script will compare clusters identified by k-means and NMF
88

9+
suppressMessages(library(checkpoint))
10+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
11+
912
args <- commandArgs(trailingOnly=TRUE)
1013
# args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset")
1114

2.Clustering_DiffExprs/Scripts/F.clusterMembership.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
# ~~~~~~~~~~~~~~~~~~~~~
77
# This script will output a data frame of cluster membership for all clustering events
88

9+
suppressMessages(library(checkpoint))
10+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
11+
912
args <- commandArgs(trailingOnly=TRUE)
1013
#args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset", "GSE26712_eset")
1114

2.Clustering_DiffExprs/Scripts/G.Dataset_concordance.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
# identified in the 2008 Clin Cancer Research Paper, and the original Konecny
1010
# subtypes identified in the 2014 JNCI paper
1111

12+
suppressMessages(library(checkpoint))
13+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
14+
1215
library(curatedOvarianData)
1316

1417
############################################

2.Clustering_DiffExprs/Scripts/H.TCGA_LMP_TothillPrediction.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
# except without removing LMP samples to contrast our biological filtering
99
# findings
1010

11+
suppressMessages(library(checkpoint))
12+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
13+
1114
set.seed(123)
1215

1316
################################

2.Clustering_DiffExprs/Scripts/H.TCGA_SupFig6.2_TothillPrediction.R

Lines changed: 0 additions & 88 deletions
This file was deleted.

3.Fit/Scripts/A.GoodnessFit.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
# ~~~~~~~~~~~~~~~~~~~~~
77
# This script will output the AIC, BIC, and Silhouette Width plots
88

9+
suppressMessages(library(checkpoint))
10+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
11+
912
args <- commandArgs(trailingOnly=TRUE)
1013
# args <- c(2, 8, 20, 20, 123, "TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset")
1114
############################################

3.Fit/Scripts/B.GAP_GoodnessFit.R

100644100755
File mode changed.

4.Survival/Scripts/A.Survival.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
# ~~~~~~~~~~~~~~~~~~~~~
77
# This script will output kaplan-meier curves and survival assessments for all input datasets
88

9+
suppressMessages(library(checkpoint))
10+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
11+
912
args <- commandArgs(trailingOnly=TRUE)
1013

1114
# args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset")

4.Survival/Scripts/B.Summarize_Survival.R

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@
55
# Konecny, G., Goode, E., Greene, C.S., Doherty, J.A.
66
# ~~~~~~~~~~~~~~~~~~~~~
77
# This script will summarize all of the multi-variate and univariate survival analyses
8-
98
############################################
9+
10+
suppressMessages(library(checkpoint))
11+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
12+
1013
args <- commandArgs(trailingOnly=TRUE)
1114
# args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset")
1215

5.Pathway/Scripts/A.GeneEnrichment.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
# The script will output a table of the differentially expressed genes for each cluster,
88
# both positively and negatively regulated
99

10+
suppressMessages(library(checkpoint))
11+
suppressMessages(checkpoint('2016-03-01', checkpointLocation = "."))
12+
1013
args <- commandArgs(trailingOnly=TRUE)
1114
#args <- c("TCGA_eset", "Mayo", "GSE32062.GPL6480_eset", "GSE9891_eset")
1215

ANALYSIS.sh

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ SEED=123
4040
NSTARTS=20
4141
NO_SHUFFLE=FALSE
4242
SHUFFLE=TRUE
43+
SAM_SUBSET='commongenes'
4344

4445
#################
4546
# PART ONE:
@@ -54,7 +55,7 @@ SHUFFLE=TRUE
5455
# NOTE: The Mayo Clinic Data is not currently in curatedOvarianData.
5556

5657
# Output the samples for each dataset that pass the inclusion criteria
57-
R --no-save --args < 1.DataInclusion/Scripts/A.getInclusion.R # (Table 1)
58+
Rscript 1.DataInclusion/Scripts/A.getInclusion.R # (Table 1)
5859

5960
# Output the common genes and the MAD (Median Absolute Deviation) genes to be
6061
# used in developing moderated t score vectors and in clustering, respectively.
@@ -77,11 +78,19 @@ B.getGenes.R
7778
# ~~~~~~~~~~~~~~~~~~~~~
7879

7980
# ~~~~~~~~~~~~~
80-
# k means & SAM
81+
# SAM with MAD genes
8182
# ~~~~~~~~~~~~~
82-
# Perform k means and SAM (args: kmin, kmax, nstarts, seed, bNMF) (Figure 1)
83-
R --no-save --args $KMIN $KMAX $NSTARTS $SEED FALSE $NO_SHUFFLE $DATASETS \
84-
"GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
83+
# Output across dataset correlations for MAD genes
84+
# NOTE: common genes used in downstream analyses
85+
R --no-save --args $KMIN $KMAX $NSTARTS $SEED FALSE $NO_SHUFFLE "madgenes" \
86+
$DATASETS "GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
87+
88+
# ~~~~~~~~~~~~~
89+
# k means & SAM (with common genes)
90+
# ~~~~~~~~~~~~~
91+
# Perform k means and SAM (Figure 1)
92+
R --no-save --args $KMIN $KMAX $NSTARTS $SEED FALSE $NO_SHUFFLE $SAM_SUBSET \
93+
$DATASETS "GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
8594

8695
# Output correlation matrices (Sup. Fig. S2)
8796
R --no-save --args $KMIN $KMAX $SEED Figures/CorrelationMatrix/ $DATASETS \
@@ -92,8 +101,8 @@ R --no-save --args $KMIN $KMAX $DATASETS < 2.Clustering_DiffExprs/Scripts/\
92101
C.KMeansBarCharts.R
93102

94103
# Shuffle genes to compare across population correlations in real data
95-
R --no-save --args $KMIN $KMAX $NSTARTS $SEED FALSE $SHUFFLE $DATASETS \
96-
"GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
104+
R --no-save --args $KMIN $KMAX $NSTARTS $SEED FALSE $SHUFFLE $SAM_SUBSET \
105+
$DATASETS "GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
97106

98107
# ~~~~~~~~~~~~~
99108
# NMF
@@ -104,8 +113,8 @@ R --no-save --args $KMIN $KMAX $NSTARTS $SEED $DATASETS "GSE26712_eset" \
104113
< 2.Clustering_DiffExprs/Scripts/D.NMF.R
105114

106115
# Run SAM on NMF clusters (TRUE argument forces NMF analysis)
107-
R --no-save --args $KMIN $KMAX $NSTARTS $SEED TRUE $NO_SHUFFLE $DATASETS \
108-
"GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
116+
R --no-save --args $KMIN $KMAX $NSTARTS $SEED TRUE $NO_SHUFFLE $SAM_SUBSET \
117+
$DATASETS "GSE26712_eset" < 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R
109118

110119
# ~~~~~~~~~~~~~
111120
# k means vs. NMF
@@ -125,7 +134,7 @@ F.clusterMembership.R
125134
R --no-save < 2.Clustering_DiffExprs/Scripts/G.Dataset_concordance.R
126135

127136
# ~~~~~~~~~~~~~
128-
# Tothill LMP #
137+
# Tothill LMP
129138
# ~~~~~~~~~~~~~
130139
# Observe consensus matrices and cophenetic coefficients for Tothill dataset if
131140
# LMP samples are not removed. This is similar to the results presented by TCGA

0 commit comments

Comments
 (0)