add get_top_phenos

neurogenomics · May 17, 2024 · 98e7d0a · 98e7d0a
1 parent 71d544b
commit 98e7d0a
Show file tree

Hide file tree

Showing 13 changed files with 441 additions and 16 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -51,7 +51,8 @@ Suggests:
     scales,
     piggyback,
     patchwork,
-    tidygraph
+    tidygraph,
+    colorspace
 Remotes:
   github::neurogenomics/KGExplorer
 VignetteBuilder: knitr

diff --git a/NAMESPACE b/NAMESPACE
@@ -27,7 +27,9 @@ export(filter_descendants)
 export(get_gene_lists)
 export(get_hpo)
 export(get_hpo_id_direct)
+export(get_top_phenos)
 export(gpt_annot_check)
+export(gpt_annot_class)
 export(gpt_annot_codify)
 export(gpt_annot_melt)
 export(gpt_annot_plot)
@@ -48,6 +50,7 @@ export(newlines_to_definition)
 export(per_branch_plot)
 export(phenos_to_granges)
 export(plot_evidence)
+export(plot_top_phenos)
 export(search_hpo)
 import(KGExplorer)
 import(data.table)
@@ -70,3 +73,4 @@ importFrom(tools,R_user_dir)
 importFrom(utils,data)
 importFrom(utils,download.file)
 importFrom(utils,head)
+importFrom(utils,tail)
diff --git a/R/add_gpt_annotations.R b/R/add_gpt_annotations.R
@@ -10,7 +10,7 @@
 #' phenos2 <- add_gpt_annotations(phenos)
 add_gpt_annotations <- function(phenos,
                                 annot = gpt_annot_codify(
-                                  reset_tiers_dict=TRUE
+                                  reset_weights_dict=TRUE
                                   )$annot_weighted,
                                 annot_cols = names(annot)[
                                   !names(annot) %in% c("hpo_id","hpo_name")

diff --git a/R/get_top_phenos.R b/R/get_top_phenos.R
@@ -0,0 +1,69 @@
+#' Get top phenotypes
+#'
+#' Get the most severe phenotypes per severity class:
+#'  Profound, Severe, Moderate, Mild.
+#' The exception to this is the "Mild" class, where the \emph{least}
+#'  severe phenotypes will be taken instead of the most severe phenotypes.
+#' @inheritParams add_ont_lvl
+#' @inheritParams add_ancestor
+#' @inheritParams plot_top_phenos
+#' @importFrom utils head tail
+#' @export
+#' @examples
+#' top_phenos <- get_top_phenos()
+get_top_phenos <- function(res_class = gpt_annot_class(),
+                           keep_ont_levels = seq(3,17),
+                           keep_descendants = "Phenotypic abnormality",
+                           n_per_class = 10,
+                           annotation_order=NULL,
+                           split_by_congenital=TRUE){
+  congenital_onset <- is_congenital <- severity_class <- value <- NULL;
+
+  if(is.null(annotation_order)){
+    weights_dict <- eval(formals(gpt_annot_codify)$weights_dict)
+    annotation_order <- gsub("_"," ",names(sort(unlist(weights_dict),
+                                                decreasing = TRUE)))
+  }
+  res_class[,is_congenital:=(congenital_onset %in% c(2,3))]
+  annot_melt <- data.table::melt.data.table(res_class,
+                                            id.vars = c("severity_class",
+                                                        "is_congenital",
+                                                        "hpo_id",
+                                                        "hpo_name",
+                                                        "severity_score_gpt"))
+  annot_melt <- annot_melt[!is.na(value),]
+  #### Filter out ont levels ####
+  annot_melt <- HPOExplorer::add_ancestor(annot_melt,
+                                          keep_descendants = keep_descendants)
+  data.table::setorderv(annot_melt,"severity_score_gpt",-1)
+  annot_melt <- HPOExplorer::add_ont_lvl(annot_melt,
+                                         keep_ont_levels = keep_ont_levels)
+  annot_melt$variable <- gsub("_", " ", annot_melt$variable)
+  annot_melt$variable <- factor(annot_melt$variable, levels = annotation_order)
+
+
+  if(split_by_congenital){
+    dat_congenital <- lapply(unique(annot_melt$severity_class), function(s){
+      sort_fun <- if(s=="mild") utils::tail else utils::head
+      annot_melt[severity_class== s &
+                   is_congenital==TRUE,
+                 sort_fun(.SD,n_per_class), by="variable"]
+    })|> data.table::rbindlist()
+    dat_noncongenital <- lapply(unique(annot_melt$severity_class), function(s){
+      sort_fun <- if(s=="mild") tail else head
+      annot_melt[severity_class== s &
+                   is_congenital==FALSE,
+                 sort_fun(.SD,n_per_class), by="variable"]
+    })|> data.table::rbindlist()
+    return(list(congenital=dat_congenital,
+                noncongenital=dat_noncongenital)
+    )
+  } else {
+    dat <- lapply(unique(annot_melt$severity_class), function(s){
+      sort_fun <- if(s=="mild") tail else head
+      annot_melt[severity_class== s,
+                 sort_fun(.SD,n_per_class), by="variable"]
+    })|> data.table::rbindlist()
+    return(dat)
+  }
+}
diff --git a/R/gpt_annot_check.R b/R/gpt_annot_check.R
@@ -111,6 +111,7 @@ gpt_annot_check <- function(annot = gpt_annot_read(),
   checks <- list(
     annot=annot,
     annot_mean=annot_mean,
+    annot_stringent_mean=annot_stringent_mean,
     consistency_count=consistency_count,
     consistency_rate=consistency_rate,
     consistency_stringent_count=consistency_count,

diff --git a/R/gpt_annot_class.R b/R/gpt_annot_class.R
@@ -0,0 +1,104 @@
+#' GPT annotations: add severity class
+#'
+#' Convert severity annotations ot severity classes
+#'  (profound > severe > moderate > mild) to approximate those introduced in
+#'  \href{https://doi.org/10.1371/journal.pone.0114391}{Lazarin et al. 2014}.
+#'
+#' @param res_coded The output of list \link{gpt_annot_codify}.
+#' @param tiers_dict A named list of severity tiers.
+#' @param inclusion_values Numerically encoded annotation values
+#' to count as hits. See the \code{gpt_annot_codify(code_dict=)}
+#'  argument defaults for the mappings.
+#'  Defaults to 2 ("often") and 3 ("always").
+#' @param add_severity_score_gpt Whether to add a continuous severity score
+#' as well.
+#' @returns A data.table with severity classes, as well as severity
+#'  scores (0-100) which can be used to rank severity within each class.
+#' @export
+#' @examples
+#' res_coded <- gpt_annot_codify()
+#' res_class <- gpt_annot_class(res_coded)
+gpt_annot_class <- function(res_coded = gpt_annot_codify(),
+                            inclusion_values=c(2,3),
+                            tiers_dict = list(
+                              ## Tier 1
+                              death=1,
+                              intellectual_disability=1,
+                              # congenital_onset=1,
+                              ## Tier 2
+                              impaired_mobility=2,
+                              physical_malformations=2,
+                              ## Tier 3
+                              blindness=3,
+                              sensory_impairments=3,
+                              immunodeficiency=3,
+                              cancer=3,
+                              ## Tier 4
+                              reduced_fertility=4
+                            ),
+                            add_severity_score_gpt=TRUE){
+  severity_class <- NULL;
+  map_severity_class <- function(r,
+                                 tiers_dict,
+                                 inclusion_values,
+                                 return_score=FALSE){
+    variable <- hpo_name <- value <- NULL;
+
+    tiers <- unique(unlist(tiers_dict))
+    tier_scores <- lapply(stats::setNames(tiers,paste0("tier",tiers)),
+                          function(x){
+                            tx <- tiers_dict[unname(unlist(tiers_dict)==x)]
+                            counts <- r[,sapply(.SD, function(v){
+                              v %in% inclusion_values
+                              }),
+                              .SDcols = names(tx)]
+                            list(
+                              counts=counts,
+                              proportion=sum(counts)/length(tx)
+                            )
+                          })
+    mean_proportion <- sapply(tier_scores, function(x)x$proportion)|>mean()
+    assigned_class <- if(sum(tier_scores$tier1$counts)>1){
+      c("profound"=mean_proportion)
+    } else if (sum(tier_scores$tier1$counts)>0 ||
+               sum(c(tier_scores$tier2$counts,tier_scores$tier3$counts))>3){
+      c("severe"=mean_proportion)
+    } else if(sum(tier_scores$tier3$counts)>0){
+      c("moderate"=mean_proportion)
+    } else{
+      c("mild"=mean_proportion)
+    }
+    if(return_score){
+      return(assigned_class)
+    } else{
+      return(names(assigned_class))
+    }
+  }
+  res_class <- data.table::copy(res_coded$annot_coded)
+  messager("Assigning severity classes.")
+  res_class[,severity_class:=map_severity_class(.SD,
+                                                inclusion_values = inclusion_values,
+                                                tiers_dict = tiers_dict), by=.I]
+  res_class[,severity_class:=factor(severity_class,
+                                    levels = c("profound","severe",
+                                               "moderate","mild"),
+                                    ordered = TRUE)]
+  # messager("Assigning severity class scores.")
+  # res_class[,severity_class_score:=map_severity_class(.SD,
+  #                                                     tiers_dict = tiers_dict,
+  #                                                     inclusion_values = inclusion_values,
+  #                                                     return_score = TRUE), by=.I]
+  if(isTRUE(add_severity_score_gpt) &&
+     "annot_weighted" %in% names(res_coded) &&
+     !"severity_score_gpt" %in% names(res_class)){
+    res_class <- merge(res_class,
+                       res_coded$annot_weighted[,c("hpo_name",
+                                                   "severity_score_gpt")])
+    data.table::setorderv(res_class,
+                          c("severity_class","severity_score_gpt"),c(1,-1))
+  } else{
+    data.table::setorderv(res_class,
+                          c("severity_class"))
+  }
+  return(res_class)
+}
diff --git a/R/gpt_annot_codify.R b/R/gpt_annot_codify.R
@@ -3,8 +3,8 @@
 #' Check GPT phenotype annotations using a several metrics.
 #' @param remove_duplicates Ensure only 1 row per phenotype.
 #' @param code_dict Numerical encodings of annotation values.
-#' @param tiers_dict Numerical encodings of annotation column.
-#' @param reset_tiers_dict Override \code{tiers_dict} values and set all values
+#' @param weights_dict Weights to be applied to each annotation metric.
+#' @param reset_weights_dict Override \code{weights_dict} values and set all values
 #' to 1. This will ensure that all annotations are unweighted.
 #' @inheritParams gpt_annot_check
 #' @inheritParams KGExplorer::filter_dt
@@ -24,9 +24,9 @@ gpt_annot_codify <- function(annot = gpt_annot_read(),
                                "often"=2,
                                "always"=3
                              ),
-                             tiers_dict=list(
+                             weights_dict=list(
+                               death=6,
                                intellectual_disability=5,
-                               death=5,
                                impaired_mobility=4,
                                physical_malformations=3,
                                blindness=4,
@@ -36,18 +36,18 @@ gpt_annot_codify <- function(annot = gpt_annot_read(),
                                reduced_fertility=1,
                                congenital_onset=1
                              ),
-                             reset_tiers_dict=FALSE,
+                             reset_weights_dict=FALSE,
                              filters=list()
                              ){
   severity_score_gpt <- hpo_name <- NULL;
 
   d <- data.table::copy(annot)
-  if(isTRUE(reset_tiers_dict)) tiers_dict <- lapply(tiers_dict,function(x){1})
+  if(isTRUE(reset_weights_dict)) weights_dict <- lapply(weights_dict,function(x){1})
   #### Ensure only 1 row/hpo_name by simply taking the first ####
   if(isTRUE(remove_duplicates)){
     d <- d[,utils::head(.SD,1), by=c("hpo_id","hpo_name")]
   }
-  cols <- names(tiers_dict)
+  cols <- names(weights_dict)
   #### Add levels ####
   d <- d[,lapply(.SD,function(x){
     factor(tolower(x),levels = rev(names(code_dict)), ordered = TRUE)
@@ -61,14 +61,14 @@ gpt_annot_codify <- function(annot = gpt_annot_read(),
   max_score <-
     sum(
       max(code_dict, na.rm = TRUE) *
-      (max(unlist(tiers_dict))*length(tiers_dict))
+      (max(unlist(weights_dict))*length(weights_dict))
     )
   d_coded <- d[,lapply(.SD,FUN=function(x){
     unlist(code_dict[tolower(x)])}),.SDcols = cols, by=c("hpo_id","hpo_name")]
   d_weighted <- data.table::as.data.table(
     lapply(stats::setNames(cols,cols),
            function(co){
-           d_coded[[co]]*tiers_dict[[co]]
+           d_coded[[co]]*weights_dict[[co]]
              })
   )[,hpo_name:=d_coded$hpo_name][,severity_score_gpt:=(
     rowSums(.SD,na.rm = TRUE)/max_score*100),