diff --git a/NAMESPACE b/NAMESPACE index f4e4564..b6a3b6e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,6 +46,7 @@ export(make_network_object) export(make_network_plot) export(make_phenos_dataframe) export(make_tiers) +export(map_disease) export(map_phenotypes) export(newlines_to_definition) export(per_branch_plot) diff --git a/R/0docs.R b/R/0docs.R index 37efd19..f669d0e 100644 --- a/R/0docs.R +++ b/R/0docs.R @@ -69,7 +69,6 @@ NULL #' Functions to add metadata to data.table objects. #' @family add_ #' @param agg_by Column to aggregate metadata by. -#' @param add_definitions Add disease definitions using \link{add_mondo}. #' @param gpt_filters A named list of filters to apply to the GPT annotations. #' @inheritParams main #' @inheritParams make_ diff --git a/R/add_disease.R b/R/add_disease.R index bc50741..e1dfbc9 100644 --- a/R/add_disease.R +++ b/R/add_disease.R @@ -1,4 +1,3 @@ -#' @describeIn add_ add_ #' Add diseases #' #' Annotate each HPO term with diseases that they are associated with. @@ -7,6 +6,8 @@ #' See #' \href{https://hpo-annotation-qc.readthedocs.io/en/latest/annotationFormat.html}{ #' here for column descriptions}. 
#' @inheritParams add_
#' @inheritParams map_disease
#' @details
#' When \code{phenos} has no \code{disease_id} column, it is first passed
#' through \link{add_genes}, which merges it with
#' \code{phenotype_to_genes}. Disease annotations are then merged in with
#' \link{map_disease} (called with \code{return_dat = TRUE}).
#'
#' \code{extra_cols} is accepted for backward compatibility only: this
#' implementation does not read it, and a warning is raised when it is
#' supplied so the omission is not silent.
#' @returns The output of \link{map_disease} applied to \code{phenos}.
#'
#' @export
#' @importFrom data.table merge.data.table
#' @examples
#' phenos <- example_phenos()
#' phenos2 <- add_disease(phenos = phenos)
add_disease <- function(phenos,
                        phenotype_to_genes = load_phenotype_to_genes(),
                        hpo = get_hpo(),
                        extra_cols = NULL,
                        all.x = TRUE,
                        use_api = FALSE,
                        workers = NULL,
                        allow.cartesian = FALSE) {
  # Checked before anything else so that the (potentially expensive)
  # default arguments are never evaluated for invalid input.
  if (!"hpo_id" %in% names(phenos)) {
    stop("hpo_id column must be present in phenos.")
  }
  # extra_cols is no longer wired to any annotation source in this body;
  # tell the caller instead of silently dropping the request.
  if (!is.null(extra_cols)) {
    warning(
      "extra_cols is currently ignored by add_disease().",
      call. = FALSE
    )
  }
  #### Ensure disease IDs are available ####
  if (!"disease_id" %in% names(phenos)) {
    phenos <- add_genes(
      phenos = phenos,
      phenotype_to_genes = phenotype_to_genes,
      hpo = hpo,
      all.x = all.x,
      allow.cartesian = allow.cartesian
    )
  }
  #### Add disease annotations ####
  phenos <- map_disease(
    dat = phenos,
    id_col = "disease_id",
    fields = c("disease"),
    use_api = use_api,
    return_dat = TRUE,
    all.x = all.x,
    allow.cartesian = allow.cartesian,
    workers = workers
  )
  return(phenos)
}
load_phenotype_to_genes(), hpo = get_hpo(), - by = c("hpo_id","disease_id"), + by = c("hpo_id","hpo_name", + "disease_id","disease_name","disease_description"), gene_col = "gene_symbol", all.x = FALSE, allow.cartesian = FALSE){ @@ -44,19 +45,12 @@ add_genes <- function(phenos = NULL, #### Ensure necessary columns are in phenos #### phenos <- add_hpo_id(phenos = phenos, hpo = hpo) - phenos <- add_disease(phenos = phenos, - allow.cartesian = allow.cartesian) #### Add Gene col to data #### if(!"gene_symbol" %in% names(phenos)){ by <- by[by %in% names(phenos)] - ## Get gene annotations - annot <- unique( - phenotype_to_genes[,unique(c(by,"gene_symbol","ncbi_gene_id")), - with=FALSE] - ) - ## Merge with input data + # ## Merge with input data phenos <- data.table::merge.data.table(phenos, - annot, + phenotype_to_genes, by = by, all.x = all.x, allow.cartesian = allow.cartesian) diff --git a/R/data.R b/R/data.R index ea3c714..9d80489 100644 --- a/R/data.R +++ b/R/data.R @@ -205,3 +205,19 @@ #' @format data.table #' @usage data("disease_id_to_omop") "disease_id_to_omop" + +#' Human Phenotype Ontology: Disease ID to Disease Name and Disease Description +#' +#' @description +#' Mapping of HPO disease ID (disease_id) to Disease Name and +#' Disease Description. +#' @source +#' \code{ +#' dat <- load_phenotype_to_genes() +#' out <- map_disease(dat) +#' disease_map <- out[,list(disease_id,disease_name,disease_description)]|>unique() +#' usethis::use_data(disease_map, overwrite = TRUE) +#' } +#' @format data.table +#' @usage data("disease_map") +"disease_map" diff --git a/R/map_disease.R b/R/map_disease.R new file mode 100644 index 0000000..a637f97 --- /dev/null +++ b/R/map_disease.R @@ -0,0 +1,101 @@ +#' Map disease +#' +#' Map disease IDs (e.g. "OMIM:101200") to names (e.g. "Apert syndrome") +#' @inheritParams add_ +#' @param dat A data.table with a column of disease IDs. +#' @param id_col The name of the column with the disease IDs. 
#' @param fields The fields to extract from the API response.
#' Requesting more than one field always queries the API.
#' @param return_dat Return the data.table with the mapped fields.
#' Only consulted when the API is queried for a single field;
#' the cached path always returns \code{dat} with the mapping merged in.
#' @param use_api Use the API to get the disease names and descriptions.
#' Otherwise, use a cached data.table (\code{disease_map}).
#' @inheritParams KGExplorer::set_cores
#' @returns
#' \itemize{
#' \item \code{dat} unchanged, when it already contains both
#'  \code{disease_name} and \code{disease_description}.
#' \item \code{dat} merged with \code{disease_map}, when the cache is used.
#' \item With the API and a single field: the row-bound per-ID results, or
#'  \code{dat} merged with them when \code{return_dat = TRUE}.
#' \item With the API and several fields: a list, named by disease ID,
#'  holding the processed API response for each ID.
#' }
#' @source \href{https://ontology.jax.org/api/network/docs}{HPO API docs}
#' @source \href{https://github.com/obophenotype/human-phenotype-ontology/issues/10232}{HPO GitHub Issue}
#' @export
#' @examples
#' dat <- HPOExplorer::load_phenotype_to_genes()
#' out <- map_disease(dat = dat, workers=1)
map_disease <- function(dat,
                        id_col = "disease_id",
                        fields = c("disease", "categories", "genes")[1],
                        use_api = TRUE,
                        return_dat = FALSE,
                        workers = NULL,
                        all.x = TRUE,
                        allow.cartesian = FALSE) {
  if (!id_col %in% names(dat)) {
    stop("id_col not found in dat.")
  }
  #### Nothing to add ####
  if (all(c("disease_name", "disease_description") %in% names(dat))) {
    messager("disease_name and disease_description already in dat.")
    return(dat)
  }
  messager("Adding disease_name and disease_description.")

  #### Fast but potentially out-of-date ####
  if (!(use_api || length(fields) > 1)) {
    disease_map <- KGExplorer::get_data_package(
      name = "disease_map",
      package = "HPOExplorer"
    )
    # Join on the caller's ID column rather than assuming it is
    # literally named "disease_id".
    dat <- data.table::merge.data.table(
      dat,
      disease_map,
      by.x = id_col,
      by.y = "disease_id",
      all.x = all.x,
      allow.cartesian = allow.cartesian
    )
    return(dat)
  }

  #### Slow but up-to-date ####
  single_field <- length(fields) == 1
  # Query the API for one disease ID and tidy the response.
  map_disease_i <- function(id) {
    # Percent-encode the ID itself, reserved characters included
    # (e.g. "OMIM:101200" -> "OMIM%3A101200").
    url <- paste0(
      "https://ontology.jax.org/api/network/annotation/",
      utils::URLencode(id, reserved = TRUE)
    )
    cont <- jsonlite::fromJSON(url)
    ## Disease-level fields
    cont$disease <- data.table::as.data.table(cont$disease)
    # skip_absent: a record lacking one of these fields must not abort
    # the whole run.
    data.table::setnames(
      cont$disease,
      c("name", "description", "mondoId"),
      c("disease_name", "disease_description", "mondo_id"),
      skip_absent = TRUE
    )
    ## Genes
    cont$genes <- data.table::as.data.table(cont$genes)
    ## Phenotype categories: one table per group, stacked
    cats <- data.table::rbindlist(
      lapply(cont$categories, function(x) {
        data.table::data.table(x)
      }),
      idcol = "hpo_group",
      fill = TRUE
    )
    data.table::setnames(
      cats,
      c("id", "name"),
      c("hpo_id", "hpo_name"),
      skip_absent = TRUE
    )
    if (ncol(cats) > 0) {
      data.table::setnames(cats, gsub("[.]", "_", names(cats)))
    }
    cont$categories <- cats
    #### Return ####
    if (single_field) {
      return(cont[[fields]])
    }
    cont
  }

  #### Iterate ####
  ids <- as.character(unique(dat[[id_col]]))
  # Rows without a disease ID cannot be looked up.
  ids <- ids[!is.na(ids)]
  BPPARAM <- KGExplorer::set_cores(workers = workers)
  res <- BiocParallel::bplapply(
    X = stats::setNames(ids, ids),
    FUN = map_disease_i,
    BPPARAM = BPPARAM
  )
  if (single_field) {
    res <- data.table::rbindlist(res, fill = TRUE)
    if (return_dat) {
      # Honour the caller's merge settings instead of a fixed all.x = TRUE.
      dat <- merge(
        dat,
        res,
        by.x = id_col,
        by.y = "id",
        all.x = all.x,
        allow.cartesian = allow.cartesian
      )
      return(dat)
    }
  }
  return(res)
}
R/add_ont_lvl.R, +% R/add_pheno_frequency.R, R/add_prevalence.R, R/add_severity.R, R/add_tier.R \name{add_} \alias{add_} \alias{add_ancestor} \alias{add_death} -\alias{add_disease} \alias{add_disease_genes} \alias{add_evidence} \alias{add_gene_frequency} @@ -48,14 +46,6 @@ add_death( agg_by = NULL ) -add_disease( - phenos, - extra_cols = NULL, - all.x = TRUE, - allow.cartesian = FALSE, - add_definitions = FALSE -) - add_disease_genes(phenos, all.x = TRUE, verbose = TRUE) add_evidence( @@ -81,7 +71,7 @@ add_genes( phenos = NULL, phenotype_to_genes = load_phenotype_to_genes(), hpo = get_hpo(), - by = c("hpo_id", "disease_id"), + by = c("hpo_id", "hpo_name", "disease_id", "disease_name", "disease_description"), gene_col = "gene_symbol", all.x = FALSE, allow.cartesian = FALSE @@ -217,14 +207,6 @@ data from both \code{x} and \code{y} are included in the output.} \item{agg_by}{Column to aggregate age of onset metadata by.} -\item{extra_cols}{Extra metadata columns from the"phenotype.hpoa" -annotations file to include. -See -\href{https://hpo-annotation-qc.readthedocs.io/en/latest/annotationFormat.html}{ -here for column descriptions}.} - -\item{add_definitions}{Add disease definitions using \link{add_mondo}.} - \item{verbose}{Print messages.} \item{evidence_score_threshold}{The minimum threshold of mean @@ -253,6 +235,7 @@ When not \code{"mondo"}, can supply multiple alternative databases to map to \item{\code{top_n}}{Top number of mappings to return per \code{top_by} grouping. 
Set to \code{NULL} to skip this step.} \item{\code{add_name}}{Logical, if TRUE, add mondo name column.} + \item{\code{add_definitions}}{logical, if TRUE, add mondo definition column.} \item{\code{ont}}{An ontology of class \link[simona]{ontology_DAG}.} \item{\code{terms}}{A vector of ontology term IDs.} \item{\code{remove_terms}}{Character vector of term IDs to exclude.} @@ -429,11 +412,6 @@ AgeOfDeath IDs and assigned "AgeOfDeath_score" values: \item{HP:0033765 }{"Death in late adulthood" (AgeOfDeath_score=8)} } -\item \code{add_disease()}: add_ -Add diseases - -Annotate each HPO term with diseases that they are associated with. - \item \code{add_disease_genes()}: add_ Add disease genes @@ -601,8 +579,6 @@ phenos <- example_phenos() phenos2 <- add_ancestor(phenos = phenos, lvl=5) phenos <- example_phenos() phenos2 <- add_death(phenos = phenos) -phenos <- example_phenos() -phenos2 <- add_disease(phenos = phenos) \dontrun{ phenos <- load_phenotype_to_genes() phenos2 <- add_severity(phenos = phenos) diff --git a/man/add_disease.Rd b/man/add_disease.Rd new file mode 100644 index 0000000..83b8f83 --- /dev/null +++ b/man/add_disease.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add_disease.R +\name{add_disease} +\alias{add_disease} +\title{Add diseases} +\usage{ +add_disease( + phenos, + phenotype_to_genes = load_phenotype_to_genes(), + hpo = get_hpo(), + extra_cols = NULL, + all.x = TRUE, + use_api = FALSE, + workers = NULL, + allow.cartesian = FALSE +) +} +\arguments{ +\item{phenos}{A data.table containing HPO IDs and other metadata.} + +\item{phenotype_to_genes}{Output of +\link{load_phenotype_to_genes} mapping phenotypes +to gene annotations.} + +\item{hpo}{Human Phenotype Ontology object, +loaded from \link[KGExplorer]{get_ontology}.} + +\item{extra_cols}{Extra metadata columns from the"phenotype.hpoa" +annotations file to include. 
+See +\href{https://hpo-annotation-qc.readthedocs.io/en/latest/annotationFormat.html}{ +here for column descriptions}.} + +\item{all.x}{logical; if \code{TRUE}, rows from \code{x} which have no matching row +in \code{y} are included. These rows will have 'NA's in the columns that are usually +filled with values from \code{y}. The default is \code{FALSE} so that only rows with +data from both \code{x} and \code{y} are included in the output.} + +\item{use_api}{Get definitions from the HPO API, +as opposed to a static local dataset.} + +\item{workers}{Number (>1) or proportion (<1) of worker cores to use.} + +\item{allow.cartesian}{See \code{allow.cartesian} in \code{\link[data.table]{[.data.table}}.} +} +\description{ +Annotate each HPO term with diseases that they are associated with. +} +\examples{ +phenos <- example_phenos() +phenos2 <- add_disease(phenos = phenos) +} diff --git a/man/disease_map.Rd b/man/disease_map.Rd new file mode 100644 index 0000000..e003efd --- /dev/null +++ b/man/disease_map.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{disease_map} +\alias{disease_map} +\title{Human Phenotype Ontology: Disease ID to Disease Name and Disease Description} +\format{ +An object of class \code{data.table} (inherits from \code{data.frame}) with 8631 rows and 3 columns. +} +\source{ +\code{ + dat <- load_phenotype_to_genes() + out <- map_disease(dat) + disease_map <- out[,list(disease_id,disease_name,disease_description)]|>unique() + usethis::use_data(disease_map, overwrite = TRUE) + } +@format data.table +@usage data("disease_map") +} +\usage{ +disease_map +} +\description{ +Mapping of HPO disease ID (disease_id) to Disease Name and +Disease Description. 
+} +\keyword{datasets} diff --git a/man/map_disease.Rd b/man/map_disease.Rd new file mode 100644 index 0000000..6c6dc61 --- /dev/null +++ b/man/map_disease.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/map_disease.R +\name{map_disease} +\alias{map_disease} +\title{Map disease} +\source{ +\href{https://ontology.jax.org/api/network/docs}{HPO API docs} + +\href{https://github.com/obophenotype/human-phenotype-ontology/issues/10232}{HPO GitHub Issue} +} +\usage{ +map_disease( + dat, + id_col = "disease_id", + fields = c("disease", "categories", "genes")[1], + use_api = TRUE, + return_dat = FALSE, + workers = NULL, + all.x = TRUE, + allow.cartesian = FALSE +) +} +\arguments{ +\item{dat}{A data.table with a column of disease IDs.} + +\item{id_col}{The name of the column with the disease IDs.} + +\item{fields}{The fields to extract from the API response.} + +\item{use_api}{Use the API to get the disease names and descriptions. +Otherwise, use a cached data.table (\code{disease_map}).} + +\item{return_dat}{Return the data.table with the mapped fields.} + +\item{workers}{Number (>1) or proportion (<1) of worker cores to use.} + +\item{all.x}{logical; if \code{TRUE}, rows from \code{x} which have no matching row +in \code{y} are included. These rows will have 'NA's in the columns that are usually +filled with values from \code{y}. The default is \code{FALSE} so that only rows with +data from both \code{x} and \code{y} are included in the output.} + +\item{allow.cartesian}{See \code{allow.cartesian} in \code{\link[data.table]{[.data.table}}.} +} +\description{ +Map disease IDs (e.g. "OMIM:101200") to names (e.g. 
"Apert syndrome") +} +\examples{ +dat <- HPOExplorer::load_phenotype_to_genes() +out <- map_disease(dat = dat, workers=1) +} diff --git a/tests/testthat/test-add_disease.R b/tests/testthat/test-add_disease.R index c917499..1f50b69 100644 --- a/tests/testthat/test-add_disease.R +++ b/tests/testthat/test-add_disease.R @@ -1,7 +1,7 @@ test_that("add_disease works", { phenos <- example_phenos() - phenos2 <- add_disease(phenos = phenos, add_definitions = TRUE) + phenos2 <- add_disease(phenos = phenos) testthat::expect_true( all(phenos$hpo_id %in% phenos2$hpo_id) )