-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdo_all_RUN_ME_TO_SEE_RNA_SEQ_PROCESSING.sh
executable file
·108 lines (90 loc) · 6.01 KB
/
do_all_RUN_ME_TO_SEE_RNA_SEQ_PROCESSING.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/bash
# Train and then test (with held out samples) for TCGA RNA-Seq samples
#
#
# PREQEQUISITE:
#
# The sarc RNA-Seq dataset must first be downloaded (from NIH TCGA data repository) and pre-processed. Use the following command.
#
# ./gdc-fetch.sh sarc TCGA-SARC_case_filter GLOBAL_file_filter_UQ
#
# Don't use the TCGA download tool - the downloaded files have to extensively pre-processed, and MY gdc_fetch does that in addition to downloading them
#
# "-A 4" means that only cancer subtypes 0,1,2,3 and 4, corresponding to tubular, intest_nos, stomach_nos, diffuse, mucinous, will be used. Subtypes 5 and 6 (, signet, papillary) which have a very small number of examples, are excluded.
#
# A lot of console output will be seen. This is deliberate, to enable you to see what's going on. You can suppress console output using the DEBUG_LEVEL_xxx parameters in 'do_all.sh'
#
#
# OUTPUTS:
#
# Tensorboard outputs are visible via your browser at: http://localhost:6006/
# provided you first start the tensorboard server using a command similar to this: tensorboard --logdir=classi/runs --samples_per_plugin images=0 --reload_interval=10
# *** Note the two tabs in Tensorboard: 'scalar' and 'graphics'
#
# Log file outputs (including some graphical outputs that aren't visible via tensorboard) will be found in '<BASE_DIR>/logs'
#
# defaults for use if user doesn't set an option:
HIGHEST_CLASS_NUMBER=6
BATCH_SIZE=16
DIVIDE_CASES="True"
HIDDEN_LAYER_NEURONS=2100
LEARNING_RATE=.00001
MAKE_BALANCED="level_up"
N_EPOCHS=100
N_SAMPLES=98765
NN_DENSE_DROPOUT_1="0.2"
NN_TYPE_RNA="DENSE"
#~ REGEN="True"
REGEN="True"
REPEAT=1
SKIP_GENERATION="False"
while getopts A:b:g:h:H:L:o:r:R:S:v:z:7: option # weird choice of letters is because they're the same as used in do_all.sh, where they are a few out of dozens of parameters
do
case "${option}"
in
A) HIGHEST_CLASS_NUMBER=${OPTARG};;
b) BATCH_SIZE=${OPTARG};;
g) SKIP_GENERATION=${OPTARG};;
h) MAKE_BALANCED=${OPTARG};;
H) HIDDEN_LAYER_NEURONS=${OPTARG};;
L) LEARNING_RATE=${OPTARG};;
o) N_EPOCHS=${OPTARG};;
r) REGEN=${OPTARG};;
R) REPEAT=${OPTARG};;
S) N_SAMPLES=${OPTARG};;
v) DIVIDE_CASES=${OPTARG};;
z) NN_TYPE_RNA=${OPTARG};;
7) NN_DENSE_DROPOUT_1=${OPTARG};;
esac
done
echo ""
echo "========================================================================================================================================================================"
echo "TRAINING RUN"
echo "========================================================================================================================================================================"
echo ""
./do_all.sh -d sarc -i rna -z ${NN_TYPE_RNA} -H ${HIDDEN_LAYER_NEURONS} -L ${LEARNING_RATE} -7 ${NN_DENSE_DROPOUT_1} -b ${BATCH_SIZE} -o ${N_EPOCHS} -A ${HIGHEST_CLASS_NUMBER} -c UNIMODE_CASE -v ${DIVIDE_CASES} -r ${REGEN}
sleep 5
# The test run will use the best model produced during training on the held out 20% test examples (20% is the default. Change it with the PCT_TEST option if you want. Eg. for 10%, use '-1 .1'
echo ""
echo "========================================================================================================================================================================"
echo "TEST RUN"
echo "========================================================================================================================================================================"
echo ""
./do_all.sh -d sarc -i rna -z ${NN_TYPE_RNA} -H ${HIDDEN_LAYER_NEURONS} -b ${BATCH_SIZE} -c UNIMODE_CASE -A ${HIGHEST_CLASS_NUMBER} -j True
#
# Meaning of options:
#
# DATASET -d Which named dataset to use (eg. ‘sarc’). Must exist locally.
# INPUT_MODE -i Type of input data: allowed values are image, rna and image_rna (always lower case)
# TILES_PER_IMAGE -f Number of tiles to extract from each Whole Slide Image
# NN_TYPE_IMG -a neural network model to be used for Image processing (e.g. VGG16, INCEPT4, RESNET152)
# BATCH_SIZE -b Number of examples per batch
# TILE_SIZE -T Size of tile to be used (pixels x pixels)
# N_EPOCHS -o Number of iterations to use during training. Each iteration pushes all N_SAMPLES (or every sample if N_SAMPLES is not specified) through the neural network, in mini-batches of size BATCH_SIZE
# CASES -c Specify a named subset of cases to be used for a particular experiment in the same or subsequent experiments. Used in conjunction with the DIVIDE_CASES parameter
# HIGHEST_CLASS_NUMBER -A Include only classes (cancer subtypes) with class numbers in the range 0 through HIGHEST_CLASS_NUMBER.
# This parameter may be used to instruct the platform to ignore certain classes – for example “Not Otherwise Specified” cancer types.
# This requires planning: classes which the experimenter may later wish to ignore in certain experiments should be given the highest class numbers.
# Class numbers are specified in the applicable master spreadsheet, in this case 'sarc_mapping_file_MASTER.csv' in folder '<BASE_DIR>/global/sarc_global'
# DIVIDE_CASES -v Divide the working directory of cases into pre-defined subsets, which may then or subsequently be selected for use in an experiment by means of the ‘CASES flags’
# PCT_TEST -1 (Numeral 1) Proportion of examples to be held out for testing. (Number in the range 0.0 to 1.0, despite the name)