Skip to content

Commit

Permalink
Reference pop data
Browse files Browse the repository at this point in the history
  • Loading branch information
markdrussell committed Feb 21, 2025
1 parent c2f7f08 commit e20e67e
Show file tree
Hide file tree
Showing 5 changed files with 283 additions and 2 deletions.
192 changes: 192 additions & 0 deletions analysis/000_baseline_data_midpoint.do
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
version 16

/*==============================================================================
DO FILE NAME: 000_baseline_data_midpoint.do
PROJECT: OpenSAFELY Disease Incidence project
DATE: 23/08/2024
AUTHOR: J Galloway / M Russell
DESCRIPTION OF FILE: Baseline data for reference population
DATASETS USED: output/dataset_definition_midpoint.csv
OTHER OUTPUT: logfiles, printed to folder $logdir
USER-INSTALLED ADO:
(place .ado file(s) in analysis/extra_ados folder - this is the folder added to the adopath below)
==============================================================================*/

*Set filepaths
*global projectdir "C:\Users\Mark\OneDrive\PhD Project\OpenSAFELY Incidence\disease_incidence"
*global projectdir "C:\Users\k1754142\OneDrive\PhD Project\OpenSAFELY Incidence\disease_incidence"
*The project root is taken to be the directory Stata is started from
global projectdir `c(pwd)'
di "$projectdir"

*Create output folders; capture makes each mkdir a no-op when the folder already exists
capture mkdir "$projectdir/output"
capture mkdir "$projectdir/output/data"
capture mkdir "$projectdir/output/tables"
capture mkdir "$projectdir/output/figures"

global logdir "$projectdir/logs"
di "$logdir"

*Make sure the log folder exists - log using cannot create a missing folder
capture mkdir "$logdir"

*Open a log file (closing any log left open by an earlier run)
cap log close
log using "$logdir/baseline_data_midpoint.log", replace

*Set Ado file path
adopath + "$projectdir/analysis/extra_ados"

*Import dataset
import delimited "$projectdir/output/dataset_definition_midpoint.csv", clear

*NOTE(review): plotplainblind is not a built-in scheme - presumably shipped in analysis/extra_ados; confirm
set scheme plotplainblind

*Derive labelled numeric covariates from the imported string columns ===================

**Age - label, describe, then drop records with no age
label variable age "Age"
codebook age
drop if age == .

**Sex - recode to 1/2; anything other than female/male stays missing and is dropped
generate gender = cond(sex == "female", 1, cond(sex == "male", 2, .))
label define gender 1 "Female" 2 "Male", modify
label values gender gender
label variable gender "Gender"
tabulate gender, missing
drop if gender == .
drop sex

**Ethnicity - codes 1-6 follow the order of the list below; the label text is the raw string
generate ethnicity_n = .
local code = 0
foreach grp in "White" "Asian or Asian British" "Black or Black British" "Mixed" "Chinese or Other Ethnic Groups" "Unknown" {
	local ++code
	replace ethnicity_n = `code' if ethnicity == "`grp'"
	label define ethnicity_n `code' "`grp'", modify
}

label values ethnicity_n ethnicity_n
label variable ethnicity_n "Ethnicity"
tabulate ethnicity_n, missing
*Swap the string column for the coded one, keeping the original variable name
drop ethnicity
rename ethnicity_n ethnicity

**IMD - codes 1-6 follow the order of the list below; the label text is the raw string
generate imd = .
local level = 0
foreach grp in "1 (most deprived)" "2" "3" "4" "5 (least deprived)" "Unknown" {
	local ++level
	replace imd = `level' if imd_quintile == "`grp'"
	label define imd `level' "`grp'", modify
}

label values imd imd
label variable imd "Index of multiple deprivation"
tabulate imd, missing
drop imd_quintile

*Store the processed reference population for the table sections
save "$projectdir/output/data/reference_data_processed.dta", replace

/*Tables================================================================*/

*Reload the processed reference population saved to output/data
use "$projectdir/output/data/reference_data_processed.dta", clear

**Baseline table for reference population
*Age is summarised as a continuous variable (contn); gender, ethnicity and imd as categorical (cat)
*No saving() option is given, so the table only appears in the results window / open log
*NOTE(review): table1_mc is not a built-in command - presumably supplied via analysis/extra_ados; confirm
*NOTE(review): this table is not rounded or redacted and the log is a released output - confirm this is acceptable
*preserve/restore guards the data in memory in case table1_mc alters it
preserve
table1_mc, total(before) onecol nospacelowpercent missing iqrmiddle(",") ///
vars(age contn %5.1f \ ///
gender cat %5.1f \ ///
ethnicity cat %5.1f \ ///
imd cat %5.1f \ ///
)
restore

**Rounded and redacted baseline table for full population
*Begin with an empty results file; each variable's table is appended to it in turn
clear *
save "$projectdir/output/data/reference_table_rounded.dta", replace emptyok
use "$projectdir/output/data/reference_data_processed.dta", clear

set type double

foreach catvar of varlist imd ethnicity gender {
	preserve
		*Collapse to one row per category with its frequency (_freq)
		contract `catvar'
		local vlab : variable label `catvar'
		gen variable = `"`vlab'"'
		decode `catvar', gen(categories)

		*Numeric working columns: count rounded to nearest 5, sum of rounded counts, percentage to 1 d.p.
		gen n_round = round(_freq, 5)
		egen n_total = total(n_round)
		gen pct = round((n_round/n_total)*100, 0.1)

		*String output columns, laid out as count, total, percent
		*Rounded counts of 7 or fewer are shown as "<8", with percent and total blanked to "-"
		gen count = cond(n_round<=7, "<8", string(n_round))
		gen total = cond(n_round<=7, "-", string(n_total))
		tostring pct, gen(percent) force format(%9.1f)
		replace percent = "-" if count=="<8"
		order percent, after(total)
		drop n_round n_total pct

		gen cohort = "All"
		order cohort, first
		list cohort variable categories count percent total
		keep cohort variable categories count percent total

		*Stack the tables accumulated so far underneath this one and re-save
		append using "$projectdir/output/data/reference_table_rounded.dta"
		save "$projectdir/output/data/reference_table_rounded.dta", replace
	restore
}

*Write the stacked table to Excel with variable names as the header row
use "$projectdir/output/data/reference_table_rounded.dta", clear
export excel "$projectdir/output/tables/reference_table_rounded.xls", replace sheet("Overall") keepcellfmt firstrow(variables)

**Table of mean age
*Start from an empty results file, then reload the processed reference population
clear *
save "$projectdir/output/data/reference_mean_age_rounded.dta", replace emptyok
use "$projectdir/output/data/reference_data_processed.dta", clear

preserve
*Collapse to a single row: number of non-missing ages, mean age and standard deviation of age
collapse (count) count=age (mean) mean_age=age (sd) stdev_age=age
*Wildcard rename: the raw count becomes freq so that "count" can hold the rounded value
rename *count freq
*Round the count to the nearest 5
gen count = round(freq, 5)
*String version of the count; rounded counts of 7 or fewer are shown as "<8"
gen countstr = string(count)
replace countstr = "<8" if count<=7
order countstr, after(count)
drop count
rename countstr count
*Mean age as a string with one decimal place, blanked to "-" when the count is redacted
tostring mean_age, gen(meanstr) force format(%9.1f)
replace meanstr = "-" if count =="<8"
drop mean_age
rename meanstr mean_age
*Standard deviation formatted and redacted in the same way
tostring stdev_age, gen(stdevstr) force format(%9.1f)
replace stdevstr = "-" if count =="<8"
order stdevstr, after(stdev_age)
drop stdev_age
rename stdevstr stdev_age
order count, first
list count mean_age stdev_age
*keep does not reorder variables: the exported column order is whatever the steps above leave in memory
keep count mean_age stdev_age
*Append the (empty) results file created at the start of this section and save the one-row table
append using "$projectdir/output/data/reference_mean_age_rounded.dta"
save "$projectdir/output/data/reference_mean_age_rounded.dta", replace
restore

*Write the table to Excel with variable names as the header row
use "$projectdir/output/data/reference_mean_age_rounded.dta", clear
export excel "$projectdir/output/tables/reference_mean_age_rounded.xls", replace keepcellfmt firstrow(variables)

***Output tables as CSVs
*Each workbook is read back without firstrow, so its header row is ordinary data;
*novarnames then stops Stata's own column names being written on top of it
foreach tbl in reference_table_rounded reference_mean_age_rounded {
	import excel "$projectdir/output/tables/`tbl'.xls", clear
	export delimited using "$projectdir/output/tables/`tbl'.csv", novarnames replace
}

log close
9 changes: 7 additions & 2 deletions analysis/001_baseline_data.do
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,23 @@ set scheme plotplainblind

*Create and label variables ===========================================================*/

**Age
codebook age_reg
keep if age_reg !=.

**Sex
gen gender = 1 if sex == "female"
replace gender = 2 if sex == "male"
lab var gender "Gender"
lab define gender 1 "Female" 2 "Male", modify
lab val gender gender
tab gender, missing
keep if gender !=.
drop sex

**Ethnicity
gen ethnicity_n = 1 if ethnicity == "White"
replace ethnicity_n = 2 if ethnicity == "Asian or Asian British" /* mixed to 6 */
replace ethnicity_n = 2 if ethnicity == "Asian or Asian British"
replace ethnicity_n = 3 if ethnicity == "Black or Black British"
replace ethnicity_n = 4 if ethnicity == "Mixed"
replace ethnicity_n = 5 if ethnicity == "Chinese or Other Ethnic Groups"
Expand Down Expand Up @@ -150,7 +155,7 @@ table1_mc, total(before) onecol nospacelowpercent missing iqrmiddle(",") ///
)
restore

**Baseline table for reference population
**Baseline table by disease
foreach disease in $diseases {
preserve
keep if `disease'==1
Expand Down
52 changes: 52 additions & 0 deletions analysis/dataset_definition_midpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from ehrql import create_dataset, days, months, years, case, when, create_measures, INTERVAL, minimum_of, maximum_of
from ehrql.tables.tpp import patients, medications, practice_registrations, clinical_events, apcs, addresses, ons_deaths, appointments
from ehrql.codes import ICD10Code
from datetime import date, datetime
import codelists_ehrQL as codelists

# Reference-population dataset: age, sex, ethnicity and IMD quintile for every
# patient registered with a practice on the index date.
dataset = create_dataset()
dataset.configure_dummy_data(population_size=1000)

index_date = "2020-08-01"

# Demographics, evaluated at the index date
dataset.age = patients.age_on(index_date)
dataset.sex = patients.sex

# True for patients who have a practice registration covering the index date
_registration_on_index = practice_registrations.for_patient_on(index_date)
any_registration = _registration_on_index.exists_for_patient()

# Ethnicity: category of the most recent ethnicity-coded event recorded on or
# before the index date
_ethnicity_events = clinical_events.where(
    clinical_events.snomedct_code.is_in(codelists.ethnicity_codes)
).where(clinical_events.date.is_on_or_before(index_date))

latest_ethnicity_code = (
    _ethnicity_events.sort_by(clinical_events.date)
    .last_for_patient()
    .snomedct_code.to_category(codelists.ethnicity_codes)
)

# Codelist category -> label; patients with any other (or no) category are "Unknown"
_ETHNICITY_LABELS = {
    "1": "White",
    "2": "Mixed",
    "3": "Asian or Asian British",
    "4": "Black or Black British",
    "5": "Chinese or Other Ethnic Groups",
}

dataset.ethnicity = case(
    *[
        when(latest_ethnicity_code == category).then(label)
        for category, label in _ETHNICITY_LABELS.items()
    ],
    otherwise="Unknown",
)

# IMD quintile from the rounded IMD value of the address held on the index date
imd = addresses.for_patient_on(index_date).imd_rounded

# NOTE(review): 32844 is presumably the highest possible IMD rank - confirm
_IMD_MAX = 32844
# Exclusive upper bound of each fifth of the range
_imd_cuts = [int(_IMD_MAX * fifth / 5) for fifth in range(1, 6)]

dataset.imd_quintile = case(
    # The first branch also requires imd >= 0; the later branches rely on
    # case() using the first matching condition
    when((imd >= 0) & (imd < _imd_cuts[0])).then("1 (most deprived)"),
    when(imd < _imd_cuts[1]).then("2"),
    when(imd < _imd_cuts[2]).then("3"),
    when(imd < _imd_cuts[3]).then("4"),
    when(imd < _imd_cuts[4]).then("5 (least deprived)"),
    otherwise="Unknown",
)

# Population: registered on the index date with sex recorded as male or female
# (any further restrictions are applied downstream)
_sex_is_binary = dataset.sex.is_in(["male", "female"])
dataset.define_population(any_registration & _sex_is_binary)
16 changes: 16 additions & 0 deletions generate_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@
highly_sensitive:
cohort: output/dataset_definition.csv
generate_dataset_midpoint:
run: ehrql:v1 generate-dataset analysis/dataset_definition_midpoint.py
--output output/dataset_definition_midpoint.csv
outputs:
highly_sensitive:
cohort: output/dataset_definition_midpoint.csv
# generate_dataset_data_avail:
# run: ehrql:v1 generate-dataset analysis/dataset_definition_data_avail.py
# --output output/dataset_definition_data_avail.csv
Expand All @@ -35,6 +42,15 @@
# log1: logs/data_avail_tables.log
# data1: output/tables/data_check_*.csv
run_baseline_data_midpoint:
run: stata-mp:latest analysis/000_baseline_data_midpoint.do
needs: [generate_dataset_midpoint]
outputs:
moderately_sensitive:
log1: logs/baseline_data_midpoint.log
table1: output/tables/reference_table_rounded.csv
table2: output/tables/reference_mean_age_rounded.csv
run_baseline_data:
run: stata-mp:latest analysis/001_baseline_data.do
needs: [generate_dataset]
Expand Down
16 changes: 16 additions & 0 deletions project.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ actions:
highly_sensitive:
cohort: output/dataset_definition.csv

# Extracts the dataset defined in analysis/dataset_definition_midpoint.py to a CSV.
# The CSV is declared highly_sensitive and is the input to run_baseline_data_midpoint.
generate_dataset_midpoint:
run: ehrql:v1 generate-dataset analysis/dataset_definition_midpoint.py
--output output/dataset_definition_midpoint.csv
outputs:
highly_sensitive:
cohort: output/dataset_definition_midpoint.csv

# generate_dataset_data_avail:
# run: ehrql:v1 generate-dataset analysis/dataset_definition_data_avail.py
# --output output/dataset_definition_data_avail.csv
Expand All @@ -32,6 +39,15 @@ actions:
# log1: logs/data_avail_tables.log
# data1: output/tables/data_check_*.csv

# Runs the Stata do-file that builds the reference-population baseline tables.
# Depends on generate_dataset_midpoint; declares the Stata log and the two CSV tables
# as moderately_sensitive outputs.
run_baseline_data_midpoint:
run: stata-mp:latest analysis/000_baseline_data_midpoint.do
needs: [generate_dataset_midpoint]
outputs:
moderately_sensitive:
log1: logs/baseline_data_midpoint.log
table1: output/tables/reference_table_rounded.csv
table2: output/tables/reference_mean_age_rounded.csv

run_baseline_data:
run: stata-mp:latest analysis/001_baseline_data.do
needs: [generate_dataset]
Expand Down

0 comments on commit e20e67e

Please sign in to comment.