-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path02_utility_functions.R
186 lines (165 loc) · 5.54 KB
/
02_utility_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# 1. Functions to calculate statistics
library(data.table)
# Column names that, taken together, identify a single feature. The helpers
# below use whichever of these columns are present in a given table.
possible_feature_columns = c(
  "PeptideSequence",
  "PeptideModifiedSequence",
  "PrecursorCharge",
  "Charge",
  "FragmentIon",
  "ProductCharge"
)
# Count the distinct combinations of feature-identifying columns in `df`.
# Only the columns of `possible_feature_columns` that exist in `df` are used.
# `df` is subset with data.table syntax (`with = FALSE`).
count_features = function(df) {
  present = intersect(names(df), possible_feature_columns)
  feature_table = df[, present, with = FALSE]
  uniqueN(feature_table)
}
# Count the distinct values in the ProteinName column of `df`.
# `df` is subset with data.table syntax (`with = FALSE`).
count_proteins = function(df) {
  protein_column = df[, "ProteinName", with = FALSE]
  uniqueN(protein_column)
}
# Count distinct peptides in `df`.
# Uses whichever of PeptideSequence / PeptideModifiedSequence exist in `df`;
# if both are present, distinct combinations of the two are counted.
count_peptides = function(df) {
  peptide_columns = c("PeptideSequence", "PeptideModifiedSequence")
  present = intersect(names(df), peptide_columns)
  peptide_table = df[, present, with = FALSE]
  uniqueN(peptide_table)
}
# Count distinct features for every protein.
# A feature label is built by joining the feature-identifying columns present
# in `df` with "_". The result has one row per ProteinName, with the number of
# distinct labels in `n_features`.
# `df` is used with data.table syntax, so it must already be a data.table.
get_features_per_protein = function(df) {
  present = intersect(names(df), possible_feature_columns)
  label_parts = as.list(df[, present, with = FALSE])
  # Equivalent to paste(col1, col2, ..., sep = "_") over the present columns.
  df$feature = do.call(paste, c(label_parts, sep = "_"))
  df[, list(n_features = uniqueN(feature)), by = "ProteinName"]
}
# Average of the `n_features` column of `features_df`, ignoring NA entries.
get_mean_features_per_protein = function(features_df) {
  counts = features_df$n_features
  mean(counts, na.rm = TRUE)
}
# Number of entries of `df$Intensity` for which is.na() is TRUE
# (this includes NaN as well as NA).
count_missing_values = function(df) {
  missing_flags = is.na(df$Intensity)
  # is.na() never yields NA itself, so a plain sum is sufficient.
  sum(missing_flags)
}
# Number of infinite (Inf or -Inf) entries of `df$Intensity`.
# NA and NaN are not counted here: is.infinite() is FALSE for them, whereas
# !is.finite() would be TRUE and would report missing values as infinite.
# Missing values are counted separately by count_missing_values().
count_infinite = function(df) {
  sum(is.infinite(df$Intensity))
}
# Number of entries of `df$Intensity` whose absolute value is below 1e-6.
# NA entries are ignored.
count_zero_values = function(df) {
  magnitudes = abs(df$Intensity)
  near_zero = magnitudes < 1e-6
  sum(near_zero, na.rm = TRUE)
}
# Number of entries of `df$Intensity` that compare equal to 0.
# NA entries are ignored.
count_exactly_zero = function(df) {
  zero_positions = which(df$Intensity == 0)
  length(zero_positions)
}
# Size of `df` as reported by pryr::object_size().
get_ram_size = function(df) {
  size_in_memory = pryr::object_size(df)
  size_in_memory
}
# Size of the file at `file_path`, in bytes divided by 1e6.
# Yields NA when the file's size cannot be determined (e.g. it does not exist).
get_disk_size = function(file_path) {
  size_bytes = file.size(file_path)
  size_bytes / 1e6
}
# Number of rows of `df` (first element of its dimensions).
count_rows = function(df) {
  dim(df)[1L]
}
# Number of columns of `df` (second element of its dimensions).
count_cols = function(df) {
  dim(df)[2L]
}
# Count rows per measurement group.
# Groups are defined by whichever of ProteinName, the feature-identifying
# columns (`possible_feature_columns`), Run and Channel exist in `df`.
# Returns a data.table with the grouping columns and the row count in
# `n_measurement`.
# `df` is used with data.table syntax, so it must already be a data.table.
count_measurements = function(df) {
  cols = intersect(colnames(df), possible_feature_columns)
  # No pasted "feature" label is built: the grouping is done on the raw
  # feature columns, so the label would only cost a full copy of the table.
  count_by = intersect(colnames(df), c("ProteinName", cols, "Run", "Channel"))
  df[, .(n_measurement = .N), by = count_by]
}
# Number of distinct values in the Fraction column of `df`,
# or NA when `df` has no such column.
count_fractions = function(df) {
  if (!("Fraction" %in% names(df))) {
    return(NA)
  }
  uniqueN(df$Fraction)
}
# Number of distinct values in column `col` of `df`,
# or NA when `df` has no column with that name.
# The column is selected with data.table syntax (`with = FALSE`).
count_design = function(df, col) {
  if (!(col %in% names(df))) {
    return(NA)
  }
  selected = df[, col, with = FALSE]
  uniqueN(selected)
}
# Number of entries of `df$Intensity` strictly greater than `threshold`.
# NA entries are ignored.
count_larger_than = function(df, threshold) {
  exceeds = df$Intensity > threshold
  sum(exceeds, na.rm = TRUE)
}
# Number of entries of `df$Intensity` in the interval (0, 1]:
# strictly above 0 and at most 1. NA entries are not counted.
count_01 = function(df) {
  values = df$Intensity
  in_unit_interval = values > 0 & values <= 1
  sum(in_unit_interval, na.rm = TRUE)
}
# Build a one-row data.table of summary statistics for `df`.
# `version` is stored as-is in the `version` column. The remaining columns hold
# the results of the counting helpers defined in this file.
getStatsSingleVersion = function(df, version) {
  # Shorthand for the design-column counts, all taken on the same table.
  design_size = function(column) count_design(df, column)

  data.table(
    version = version,
    # Identifier counts
    n_features = count_features(df),
    n_proteins = count_proteins(df),
    n_peptides = count_peptides(df),
    # Intensity summaries
    n_infinite = count_infinite(df),
    n_missing = count_missing_values(df),
    n_zero = count_zero_values(df),
    n_exactly_zero = count_exactly_zero(df),
    n_greater_than_0 = count_larger_than(df, 0),
    n_greater_than_1 = count_larger_than(df, 1),
    n_between_0_1 = count_01(df),
    # Table dimensions
    n_rows = count_rows(df),
    n_cols = count_cols(df),
    # Experimental design (NA when the column is absent)
    n_conditions = design_size("Condition"),
    n_bioreps = design_size("BioReplicate"),
    n_runs = design_size("Run"),
    n_tech_replicates = design_size("TechRep"),
    n_tech_rep_mixture = design_size("TechRepMixture"),
    n_fractions = count_fractions(df),
    n_channels = design_size("Channel")
  )
}
# Compute summary statistics for two inputs and stack them into one table.
# `v3` is converted with as.data.table(); `v4` is first coerced with
# as(v4, "data.frame") and then converted. The rows are labelled "v3" and "v4".
getStats = function(v3, v4) {
  stats_v3 = getStatsSingleVersion(as.data.table(v3), "v3")
  v4_frame = as(v4, "data.frame")
  stats_v4 = getStatsSingleVersion(as.data.table(v4_frame), "v4")
  rbindlist(list(stats_v3, stats_v4))
}
# files = list.files("./single_outputs", full.names = TRUE)
# statistics = lapply(files, function(path) {
# result = tryCatch({
# df = as.data.table(readRDS(path))
# list(stats = data.table(
# n_features = count_features(df),
# n_proteins = count_proteins(df),
# n_peptides = count_peptides(df),
# n_missing = count_missing_values(df),
# n_rows = count_rows(df),
# n_cols = count_cols(df),
# n_conditions = count_design(df, "Condition"),
# n_bioreps = count_design(df, "BioReplicate"),
# n_runs = count_design(df, "Run"),
# n_tech_replicates = count_design(df, "TechRep"),
# n_tech_rep_mixture = count_design(df, "TechRepMixture"),
# n_channels = count_design(df, "Channel")),
# n_features_per_protein = get_features_per_protein(df),
# n_measurements = count_measurements(df)
# )
#
# }, error = function(e) data.table())
# return(result)
# })
# saveRDS(statistics, "statistics.RDS")
# statistics
#
# simple_stats = lapply(statistics, function(x) {
# if(length(x) == 3) {
# x[[1]]
# } else {
# data.table()
# }
# })
#
# simple_stats_full = lapply(1:length(simple_stats), function(x) {
# df = simple_stats[[x]]
# df$path = files[x]
# df
# })
#
# rbindlist(simple_stats_full, fill = TRUE)
#
# path = files[1]
# df = as.data.table(readRDS(path))
# data.table(
# n_features = count_features(df),
# n_proteins = count_proteins(df),
# n_peptides = count_peptides(df),
# n_features_per_protein = get_features_per_protein(df),
# n_missing = count_missing_values(df),
# n_rows = count_rows(df),
# n_cols = count_cols(df),
# n_measurements = count_measurements(df),
# n_conditions = count_design(df, "Condition"),
# n_bioreps = count_design(df, "BioReplicate"),
# n_runs = count_design(df, "Run"),
# n_tech_replicates = count_design(df, "TechRep")
# )