From 02be20108ea31e8eacfc9ff4847ea0f8922bce9a Mon Sep 17 00:00:00 2001 From: "Brian M. Schilder" <34280215+bschilder@users.noreply.github.com> Date: Mon, 11 Mar 2024 15:28:14 +0000 Subject: [PATCH] Fix gpt annot functions [skip ci] --- R/gpt_annot_check.R | 69 ++++++++++++++++++----- R/gpt_annot_check_plot.R | 118 +++++++++++++++++++++++++++++++++------ R/gpt_annot_codify.R | 14 ++--- R/gpt_annot_read.R | 10 ++-- R/plot_evidence.R | 2 +- man/gpt_annot_check.Rd | 10 ++++ man/gpt_annot_read.Rd | 7 ++- 7 files changed, 185 insertions(+), 45 deletions(-) diff --git a/R/gpt_annot_check.R b/R/gpt_annot_check.R index 939c917..a7ffa63 100644 --- a/R/gpt_annot_check.R +++ b/R/gpt_annot_check.R @@ -6,6 +6,10 @@ #' generated by \link[HPOExplorer]{search_hpo}. #' These will be used as the ground truth when trying to identify #' true positive annotations. +#' @param response_map A named list of responses to map onto. +#' Used for standardisation of responses. +#' @param pos_values Positive values. +#' @param neg_values Negative values. #' @param verbose Print messages. #' @returns Named list #' @@ -16,12 +20,15 @@ #' checks <- gpt_annot_check() gpt_annot_check <- function(annot = gpt_annot_read(), query_hits = search_hpo(), + response_map = list( + no="never", + yes="always" + ), + pos_values=c("sometimes","often","always"), + neg_values = c("never","rarely"), verbose = TRUE ){ - - # path="~/Downloads/gpt_hpo_annotations.csv" pheno_count <- NULL; - #### Proportion of HPO_IDs annotated before/after chatGPT #### # hpo <- get_hpo() # prior_ids <- unique(HPOExplorer::hpo_modifiers$hpo_id) @@ -30,15 +37,33 @@ gpt_annot_check <- function(annot = gpt_annot_read(), # length(prior_ids)/length(hpo@terms) # length(new_ids)/length(hpo@terms) #### Check annotation consistency #### - nm <- names(annot)[!grepl("hpo_name|justification|hpo_id",names(annot), + nm <- names(annot)[!grepl("hpo_name|justification|hpo_id|hpo_name|pheno_count",names(annot), ignore.case = TRUE)] - counts <- table(tolower(unlist(annot[,nm,with=FALSE])), useNA = "always") - neg_values <- c("never","no") - opts <- unlist(sapply(annot[,nm,with=FALSE], unique)) |> unique() + response_counts <- table(tolower(unlist(annot[,nm,with=FALSE])), useNA = "always") + # opts <- unlist(sapply(annot[,nm,with=FALSE], unique)) |> unique() + + #### Standardise responses #### + for(n in nm){ + annot[,(n):=tolower(get(n))] + annot[,(n):=ifelse(get(n) %in% names(response_map),response_map[[get(n)]], get(n)), by=.I] + annot[,(n):=ifelse(get(n) %in% c(pos_values,neg_values),get(n),NA), by=.I] + } + response_counts_standard <- table(tolower(unlist(annot[,nm,with=FALSE])), useNA = "always") + #### Compute number of non-negative answers within each column#### - annot_mean <- annot[pheno_count>1][,lapply(.SD,function(x){ - mean(!tolower(x) %in% neg_values) - }),.SDcols=nm,by="hpo_name"] + if(nrow(annot[pheno_count>1])==0){ + messager("No duplicate phenotypes to check consistency for.") + annot_mean <- NULL + }else { + #### Check for relaxed consistency (only distguish between positive and negative )#### + annot_mean <- annot[pheno_count>1][,lapply(.SD,function(x){ + mean(!na.omit(x) %in% neg_values) + }),.SDcols=nm,by="hpo_name"] + #### Check for stringent consistency #### + annot_stringent_mean <- annot[pheno_count>1][,lapply(.SD,function(x){ + data.table::uniqueN(x)==1 + }),.SDcols=nm,by="hpo_name"] + } #### Check ontology classifications ##### annot_check <- lapply(seq(nrow(annot)), function(i){ r <- annot[i,] @@ -48,7 +73,7 @@ gpt_annot_check <- function(annot = gpt_annot_read(), names(query_hits)), function(x){ if(r$hpo_id %in% query_hits[[x]]){ - !tolower(r[,x,with=FALSE][[1]]) %in% neg_values + tolower(r[,x,with=FALSE][[1]]) %in% pos_values } else { NA } @@ -57,8 +82,16 @@ gpt_annot_check <- function(annot = gpt_annot_read(), }) |> data.table::rbindlist() #### Compute consistency within each column #### - annot_consist <- sapply(annot_mean[,-1], - function(x)sum(x%in%c(0,1)/nrow(annot_mean))) + if(!is.null(annot_mean)){ + consistency_count <- sapply(annot_mean[,-c("hpo_name")], + function(x)nrow(annot_mean)) + consistency_rate <- sapply(annot_mean[,-c("hpo_name")], + function(x)sum(x%in%c(0,1)/nrow(annot_mean))) + consistency_stringent_rate <- sapply(annot_stringent_mean[,-c("hpo_name")], + function(x)sum(x)/nrow(annot_mean)) + } else { + consistency_count <- consistency_rate <- consistency_stringent_rate <- NULL + } ### Proportion of rows where annotation is not NA checkable_rate <- sapply( annot_check[,names(query_hits),with=FALSE], @@ -78,12 +111,18 @@ gpt_annot_check <- function(annot = gpt_annot_read(), checks <- list( annot=annot, annot_mean=annot_mean, - annot_consist=annot_consist, + consistency_count=consistency_count, + consistency_rate=consistency_rate, + consistency_stringent_count=consistency_count, + consistency_stringent_rate=consistency_stringent_rate, annot_check=annot_check, checkable_rate=checkable_rate, checkable_count=checkable_count, + true_pos_count=checkable_count, true_pos_rate=true_pos_rate, - false_neg_rate=false_neg_rate + false_neg_rate=false_neg_rate, + response_counts=response_counts, + response_counts_standard=response_counts_standard ) #### Plot #### checks[["plot"]] <- gpt_annot_check_plot(checks=checks) diff --git a/R/gpt_annot_check_plot.R b/R/gpt_annot_check_plot.R index e1abd64..31e1958 100644 --- a/R/gpt_annot_check_plot.R +++ b/R/gpt_annot_check_plot.R @@ -1,29 +1,113 @@ gpt_annot_check_plot <- function(checks, - items = c("annot_consist", - "checkable_count", - #"checkable_rate", - "true_pos_rate")){ + items = c("consistency_count","consistency_rate", + "consistency_stringent_count","consistency_stringent_rate", + # "checkable_count","checkable_rate", + "true_pos_count","true_pos_rate"), + metric_types=c("Rate","Count")[1], + scales = "free"){ requireNamespace("ggplot2") - annotation <- value <- metric <- NULL; + annotation <- metric <- metric_category <- metric_type <- NULL; check_df <- lapply(checks[items], data.table::as.data.table, keep.rownames = TRUE) |> data.table::rbindlist(idcol = "metric") |> data.table::setnames(c("metric","annotation","value")) - check_df$metric <- factor(check_df$metric, levels = items, ordered = TRUE) - ggplot2::ggplot(check_df[annotation!="pheno_count"], - ggplot2::aes(x=annotation, y=value, fill=metric)) + - ggplot2::geom_bar(stat="identity") + - ggplot2::facet_grid(rows="metric", - scales = "free_y",) + + check_df$metric <- factor(check_df$metric, + levels = items, + ordered = TRUE) + check_df$annotation <- factor(check_df$annotation, + levels = unique(check_df$annotation), + ordered = TRUE) + # check_df[,row_count:=.N[!is.na(value)], by=c("metric","annotation")] + check_df[,metric_type:=ifelse(grepl("count",metric), + "Count",ifelse(grepl("rate",metric),"Rate",NA))] + check_df[,metric_category:=gsub("_count|_rate","",metric)] + check_df$metric_type <- factor(check_df$metric_type, ordered = TRUE) + check_df <- check_df[annotation!="pheno_count"] + combos <- expand.grid( + metric_category=check_df$metric_category, + metric_type=check_df$metric_type, + annotation=check_df$annotation + ) |> unique() |> `rownames<-`(NULL) + check_df2 <- merge(check_df, + combos, + by=c("metric_category","metric_type","annotation"), + all.y=TRUE) + # check_df[,label:=paste0("n=",value[metric_type=="Count"]), + # by=c("metric_category","annotation")] + check_df[,n:=value[metric_type=="Count"], + by=c("metric_category","annotation")] + + plt <- ggplot2::ggplot(check_df[metric_type %in% metric_types], + ggplot2::aes(x=annotation, y=value, + fill=n, + label=round(value,2))) + + ggplot2::geom_bar(stat="identity", show.legend = TRUE) + + # ggplot2::scale_fill_viridis_d(drop = FALSE, end = .8 )+ + ggplot2::scale_fill_viridis_c()+ + ggplot2::scale_x_discrete(drop = scales %in% c("free","free_x")) + + ggplot2::facet_grid(facets=metric_type~metric_category, + scales = scales) + ggplot2::theme_bw() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1), - strip.background = ggplot2::element_rect(fill = "grey20"), - strip.text = ggplot2::element_text(color = "white")) + - ggplot2::labs(title = "GPT annotation validation", - x = "Annotation column", - y = "Value", - fill = "Metric") + strip.background = ggplot2::element_rect(fill = "grey20"), + strip.text = ggplot2::element_text(color = "white")) + + ggplot2::labs(x = "Annotation", + y = "Value", + fill = "N phenotypes") + + ggplot2::geom_label(fill='white') + # ggplot2::geom_text(angle=90, size=3, hjust=1.2) + # plt + if(scales %in% c("free","free_x")){ + plt <- plt+ + ggplot2::geom_vline(data = check_df2[is.na(value)], + ggplot2::aes(xintercept=annotation), + # label="N/A", + color="red", linetype="solid", linewidth=3, alpha=0.25) + + ggplot2::geom_text(data = check_df2[is.na(value)][,label:="N/A"], + ggplot2::aes(x=annotation, label=label, y=2), + position=ggplot2::position_fill(vjust = 0.1), + color="red", alpha=0.5, angle=90) + } + # count_df <- check_df[metric_type=="Count",,drop=FALSE] + # rate_df <- check_df[metric_type=="Rate",,drop=FALSE][annotation!="pheno_count"] + # plts <- list() + # make_plot <- function(df, + # y_lab=NULL){ + # p <- ggplot2::ggplot(df, + # ggplot2::aes(x=annotation, y=value, + # fill=metric_type)) + + # ggplot2::geom_bar(stat="identity", show.legend = FALSE) + + # ggplot2::scale_fill_viridis_d(drop=FALSE, end = .8 )+ + # ggplot2::scale_x_discrete(drop = FALSE) + + # ggplot2::facet_grid(rows="metric") + + # ggplot2::theme_bw() + + # ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1), + # strip.background = ggplot2::element_rect(fill = "grey20"), + # strip.text = ggplot2::element_text(color = "white")) + + # ggplot2::labs(x = "Annotation", + # y = y_lab, + # fill = "Metric") + # } + # if(nrow(count_df)>0){ + # plts[["count"]] <- make_plot(count_df,y_lab = "Count") + # } + # if(nrow(rate_df)>0){ + # plts[["rate"]] <- make_plot(rate_df,y_lab = "Rate") + # } + # if(length(plts)>1){ + # plts[[1]] <- plts[[1]] + + # ggplot2::theme(axis.text.x = ggplot2::element_blank()) + # } + # + # plt <- patchwork::wrap_plots(plts, ncol = 1, + # guides = "collect") + + # patchwork::plot_layout(axis_titles = "collect", + # axes = "collect") + return(list( + data=check_df, + plot=plt + )) } diff --git a/R/gpt_annot_codify.R b/R/gpt_annot_codify.R index 38f23d7..546a551 100644 --- a/R/gpt_annot_codify.R +++ b/R/gpt_annot_codify.R @@ -40,9 +40,7 @@ gpt_annot_codify <- function(annot = gpt_annot_read(), reset_tiers_dict=FALSE, filters=list() ){ - # res <- gpt_annot_check(path="~/Downloads/gpt_hpo_annotations.csv") - # annot <- res$annot - severity_score_gpt <- congenital_onset <- hpo_name <- hpo_id <- NULL; + severity_score_gpt <- hpo_name <- NULL; d <- data.table::copy(annot) if(isTRUE(reset_tiers_dict)) tiers_dict <- lapply(tiers_dict,function(x){1}) @@ -70,12 +68,14 @@ gpt_annot_codify <- function(annot = gpt_annot_read(), unlist(code_dict[tolower(x)])}),.SDcols = cols, by=c("hpo_id","hpo_name")] d_weighted <- data.table::as.data.table( lapply(stats::setNames(cols,cols), - function(co){d_coded[[co]]* - ((max(unlist(tiers_dict))+1)-tiers_dict[[co]]) }) - )[,hpo_id:=d_coded$hpo_id][,severity_score_gpt:=( + function(co){ + d_coded[[co]]* + ((max(unlist(tiers_dict))+1)-tiers_dict[[co]]) + }) + )[,hpo_name:=d_coded$hpo_name][,severity_score_gpt:=( rowSums(.SD,na.rm = TRUE)/max_score*100), .SDcols=cols][d_coded[,-cols,with=FALSE], - on="hpo_id"] |> + on="hpo_name"] |> data.table::setorderv("severity_score_gpt",-1, na.last = TRUE) |> unique() #### Order hpo_names by severity_score_gpt ##### diff --git a/R/gpt_annot_read.R b/R/gpt_annot_read.R index e753260..5ed8c33 100644 --- a/R/gpt_annot_read.R +++ b/R/gpt_annot_read.R @@ -3,6 +3,7 @@ #' Read in phenotype annotations generated by GPT and #' do some initial preprocessing (e.g. adding HPO IDs). #' @inheritParams main +#' @inheritParams make_ #' @param save_path Path to annotations CSV file. #' If the file does not exist, the data will be downloaded from GitHub. #' @param force_new If \code{TRUE}, the data will be downloaded from GitHub @@ -13,11 +14,13 @@ #' #' @export #' @examples -#' annot <- gpt_annot_read() +#' gpt_annot <- gpt_annot_read() gpt_annot_read <- function(save_path=file.path( KGExplorer::cache_dir(package = "HPOExplorer"), "gpt4_hpo_annotations.csv" ), + +phenotype_to_genes = load_phenotype_to_genes(), force_new=FALSE, hpo=get_hpo(), include_nogenes=TRUE, @@ -37,9 +40,8 @@ include_nogenes=TRUE, data.table::setnames(d,"phenotype","hpo_name") d <- add_hpo_id(d, hpo = hpo) #### Check phenotype names #### - annot <- load_phenotype_to_genes(verbose = verbose) d <- merge(d, - unique(annot[,c("hpo_id","hpo_name")]), + unique(phenotype_to_genes[,c("hpo_id","hpo_name")]), all.x = TRUE, by=c("hpo_name","hpo_id")) d <- data.frame(d) @@ -57,7 +59,7 @@ include_nogenes=TRUE, "phenotypes.", v=verbose) # phenotype_miss_rate <- - # length(d$phenotype[!d$phenotype %in% annot$hpo_name]) / + # length(d$phenotype[!d$phenotype %in% phenotype_to_genes$hpo_name]) / # length(d$phenotype) return(d) } diff --git a/R/plot_evidence.R b/R/plot_evidence.R index 36d300a..dd384cd 100644 --- a/R/plot_evidence.R +++ b/R/plot_evidence.R @@ -70,7 +70,7 @@ plot_evidence <- function(metric="evidence_score_sum", evidence_score_sum_min=min(get(metric),na.rm=TRUE), evidence_score_sum_max=max(get(metric),na.rm=TRUE), evidence_score_sum_mean=mean(get(metric),na.rm=TRUE), - evidence_score_sum_sd=sd(get(metric),na.rm=TRUE)), + evidence_score_sum_sd=stats::sd(get(metric),na.rm=TRUE)), by=c("hpo_id","gene_symbol")]|> data.table::setorderv(metric_sd, -1,na.last = TRUE) h5 <- plot_hist(gcc_phenotype_agg_sd[!is.na(get(metric_sd))], diff --git a/man/gpt_annot_check.Rd b/man/gpt_annot_check.Rd index 881da7a..0caef1a 100644 --- a/man/gpt_annot_check.Rd +++ b/man/gpt_annot_check.Rd @@ -7,6 +7,9 @@ gpt_annot_check( annot = gpt_annot_read(), query_hits = search_hpo(), + response_map = list(no = "never", yes = "always"), + pos_values = c("sometimes", "often", "always"), + neg_values = c("never", "rarely"), verbose = TRUE ) } @@ -18,6 +21,13 @@ generated by \link[HPOExplorer]{search_hpo}. These will be used as the ground truth when trying to identify true positive annotations.} +\item{response_map}{A named list of responses to map onto. +Used for standardisation of responses.} + +\item{pos_values}{Positive values.} + +\item{neg_values}{Negative values.} + \item{verbose}{Print messages.} } \value{ diff --git a/man/gpt_annot_read.Rd b/man/gpt_annot_read.Rd index a7ca7e6..21eec84 100644 --- a/man/gpt_annot_read.Rd +++ b/man/gpt_annot_read.Rd @@ -7,6 +7,7 @@ gpt_annot_read( save_path = file.path(KGExplorer::cache_dir(package = "HPOExplorer"), "gpt4_hpo_annotations.csv"), + phenotype_to_genes = load_phenotype_to_genes(), force_new = FALSE, hpo = get_hpo(), include_nogenes = TRUE, @@ -17,6 +18,10 @@ gpt_annot_read( \item{save_path}{Path to annotations CSV file. If the file does not exist, the data will be downloaded from GitHub.} +\item{phenotype_to_genes}{Output of +\link{load_phenotype_to_genes} mapping phenotypes +to gene annotations.} + \item{force_new}{If \code{TRUE}, the data will be downloaded from GitHub even if it already exists locally.} @@ -35,5 +40,5 @@ Read in phenotype annotations generated by GPT and do some initial preprocessing (e.g. adding HPO IDs). } \examples{ -annot <- gpt_annot_read() +gpt_annot <- gpt_annot_read() }