# generate_yaml.py
# Repository generated from opensafely/research-template.
"""Generate ``project.yaml`` for the OpenSAFELY pipeline.

The file is assembled from four text fragments, in this order:

1. ``yaml_header``  - fixed preamble plus the two dataset-extraction actions;
2. ``yaml_prebody`` - one ``generate_baseline_data_<year>`` action per year;
3. ``yaml_body``    - one ``measures_dataset_<disease>_<year>`` action per
   disease and year;
4. ``yaml_footer``  - the Stata/R actions, whose ``needs:`` lists reference
   the actions generated in steps 2 and 3.

Running this script overwrites ``project.yaml`` in the current directory.

YAML nesting is expressed purely through leading spaces, so the indentation
inside the templates below is significant and must not be reformatted.
"""

# Every disease gets one measures action per year.
diseases = [
    "asthma",
    "copd",
    "chd",
    "stroke",
    "heart_failure",
    "dementia",
    "multiple_sclerosis",
    "epilepsy",
    "crohns_disease",
    "ulcerative_colitis",
    "dm_type2",
    "ckd",
    "psoriasis",
    "atopic_dermatitis",
    "osteoporosis",
    "rheumatoid",
    "depression",
    "coeliac",
    "pmr",
]
# Alternative shorter list kept from the original (currently disabled):
# diseases = ["rheumatoid", "pmr"]

# Years for which per-year actions are generated: 2016 through 2024 inclusive.
# Defined once so the baseline and measures actions cannot drift apart.
_YEARS = range(2016, 2025)


def _intervals_for(year):
    """Return the ``--intervals`` value used by the measures action of *year*."""
    # 2024 is the only year given 8 intervals; every other year gets 12.
    return 8 if year == 2024 else 12


# NOTE: the header is deliberately NOT passed through str.format(); the
# ``{diseases}`` markers sit on commented-out YAML lines and are written
# to the output literally.
yaml_header = """
version: '3.0'
expectations:
  population_size: 1000
actions:
  generate_dataset:
    run: ehrql:v1 generate-dataset analysis/dataset_definition.py
      --output output/dataset_definition.csv
      #--
      #--diseases "{diseases}"
    outputs:
      highly_sensitive:
        cohort: output/dataset_definition.csv
  generate_dataset_demographics_disease:
    run: ehrql:v1 generate-dataset analysis/dataset_definition_demographics_disease.py
      --output output/dataset_definition_demographics_disease.csv
    outputs:
      highly_sensitive:
        cohort: output/dataset_definition_demographics_disease.csv
  # generate_dataset_data_avail:
    # run: ehrql:v1 generate-dataset analysis/dataset_definition_data_avail.py
      # --output output/dataset_definition_data_avail.csv
      # #--
      # #--diseases "{diseases}"
    # outputs:
      # highly_sensitive:
        # cohort: output/dataset_definition_data_avail.csv
  # run_data_avail:
    # run: stata-mp:latest analysis/101_data_availability.do
    # needs: [generate_dataset_data_avail]
    # outputs:
      # moderately_sensitive:
        # log1: logs/data_avail_tables.log
        # data1: output/tables/data_check_*.csv
"""

# Template for one per-year baseline extraction action; ``{year}`` is the
# only placeholder.
yaml_demog = """
  generate_baseline_data_{year}:
    run: ehrql:v1 generate-dataset analysis/dataset_definition_demographics.py
      --output output/dataset_definition_{year}.csv
      --
      --start-date "{year}-04-01"
    outputs:
      highly_sensitive:
        cohort: output/dataset_definition_{year}.csv
"""

# Action names of all per-year baseline extractions, in year order.
all_need = [f"generate_baseline_data_{year}" for year in _YEARS]
yaml_prebody = "".join(yaml_demog.format(year=year) for year in _YEARS)
need_list = ", ".join(all_need)

# Template for one measures action; placeholders are ``{disease}``,
# ``{year}`` and ``{intervals}``.
yaml_template = """
  measures_dataset_{disease}_{year}:
    run: ehrql:v1 generate-measures analysis/dataset_definition_measures.py
      --output output/measures/measures_dataset_{disease}_{year}.csv
      --
      --start-date "{year}-04-01"
      --intervals {intervals}
      --disease "{disease}"
    needs: [generate_dataset]
    outputs:
      highly_sensitive:
        measure_csv: output/measures/measures_dataset_{disease}_{year}.csv
"""

# Year is the outer loop and disease the inner one, so actions are grouped
# by year in the generated file and in the ``needs`` list.
_measure_jobs = [(disease, year) for year in _YEARS for disease in diseases]
all_needs = [
    f"measures_dataset_{disease}_{year}" for disease, year in _measure_jobs
]
yaml_body = "".join(
    yaml_template.format(
        disease=disease, year=year, intervals=_intervals_for(year)
    )
    for disease, year in _measure_jobs
)
needs_list = ", ".join(all_needs)

# Plain (non-f) string: ``{need_list}`` and ``{needs_list}`` are filled in
# by a single str.format() call below, so the text is interpolated exactly
# once.
yaml_footer_template = """
  run_baseline_data_reference:
    run: stata-mp:latest analysis/000_baseline_data_reference.do
    needs: [{need_list}]
    outputs:
      moderately_sensitive:
        log1: logs/baseline_data_reference.log
        table1: output/tables/reference_table_rounded.csv
  run_baseline_data_reference_all:
    run: stata-mp:latest analysis/003_baseline_data_reference_all.do
    needs: [generate_dataset_demographics_disease]
    outputs:
      moderately_sensitive:
        log1: logs/baseline_data_reference_all.log
        table1: output/tables/reference_table_rounded_all.csv
  run_baseline_data_disease:
    run: stata-mp:latest analysis/001_baseline_data_disease.do
    needs: [generate_dataset_demographics_disease]
    outputs:
      moderately_sensitive:
        log1: logs/baseline_data_disease.log
        table1: output/tables/baseline_table_rounded.csv
        table3: output/tables/incidence_count_*.csv
        figure1: output/figures/count_inc_*.svg
  run_data_processing:
    run: stata-mp:latest analysis/002_processing_data.do
    needs: [generate_dataset, {needs_list}]
    outputs:
      moderately_sensitive:
        log1: logs/processing_data.log
        table1: output/tables/redacted_counts_*.csv
  run_incidence_graphs:
    run: stata-mp:latest analysis/100_incidence_graphs.do
    needs: [run_data_processing]
    outputs:
      moderately_sensitive:
        log1: logs/descriptive_tables.log
        figure1: output/figures/inc_comp_*.svg
        figure2: output/figures/prev_comp_*.svg
        figure3: output/figures/prev_adj_*.svg
        figure4: output/figures/inc_adj_*.svg
        figure5: output/figures/adj_sex_*.svg
        figure6: output/figures/unadj_age_*.svg
        figure7: output/figures/unadj_ethn_*.svg
        figure8: output/figures/unadj_imd_*.svg
        table1: output/tables/arima_standardised.csv
  run_sarima:
    run: r:latest analysis/200_sarima.R
    needs: [run_incidence_graphs]
    outputs:
      moderately_sensitive:
        log1: logs/sarima_log.txt
        figure1: output/figures/raw_pre_covid_*.svg
        figure2: output/figures/differenced_pre_covid_*.svg
        figure3: output/figures/seasonal_pre_covid_*.svg
        figure4: output/figures/raw_acf_*.svg
        figure5: output/figures/differenced_acf_*.svg
        figure6: output/figures/seasonal_acf_*.svg
        figure7: output/figures/auto_residuals_*.svg
        figure8: output/figures/obs_pred_*.svg
        table1: output/tables/change_incidence_byyear.csv
        table2: output/tables/values_*.csv
"""
yaml_footer = yaml_footer_template.format(
    need_list=need_list, needs_list=needs_list
)

# Combine header, per-year baseline actions, measures actions and footer.
generated_yaml = yaml_header + yaml_prebody + yaml_body + yaml_footer

# Save to a file (overwrites any existing project.yaml).
with open("project.yaml", "w", encoding="utf-8") as file:
    file.write(generated_yaml)