update installation instructions [skip ci]

neurogenomics · Mar 8, 2024 · e0c2a3f · e0c2a3f
1 parent 17ad138
commit e0c2a3f
Show file tree

Hide file tree

Showing 12 changed files with 123 additions and 79 deletions.
diff --git a/R/get_gencc.R b/R/get_gencc.R
@@ -63,11 +63,17 @@ get_gencc <- function(agg_by=c("disease_id",
   #                to = c("OMIM","DECIPHER","Orphanet"))
   #### Encode evidence numerically ####
   d[,evidence_score:=dict[classification_title]]
+  #### Report ####
+  messager("Evidence scores for:",
+           "\n -",length(unique(d$disease_id)),"diseases",
+           "\n -",length(unique(d$gene_symbol)),"genes")
   #### Aggregate so that there's 1 entry/gene/disease ####
   if(!is.null(agg_by)){
     d <- d[,list(evidence_score_min=min(evidence_score, na.rm = TRUE),
                  evidence_score_max=max(evidence_score, na.rm = TRUE),
-                 evidence_score_mean=mean(evidence_score, na.rm=TRUE)),
+                 evidence_score_mean=mean(evidence_score, na.rm=TRUE),
+                 evidence_score_sum=sum(evidence_score, na.rm=TRUE),
+                 evidence_score_sd=sd(evidence_score, na.rm=TRUE)),
                by=agg_by]
   }
   #### Add version ####

diff --git a/R/get_ontology.R b/R/get_ontology.R
@@ -48,6 +48,12 @@ get_ontology <- function(name=c("mondo",
   name <- name[1]
   method <- match.arg(method) 
 
+  save_path <- file.path(save_dir, paste0(name,".rds"))
+  if(isFALSE(force_new) && file.exists(save_path)){
+    messager("Loading cached ontology:",save_path)
+    ont <- readRDS(save_path)
+    return(ont)
+  } 
   ol <- rols::Ontologies()
   rols_opts <- get_ols_options(ol=ol)
   if(method=="rols" && 
@@ -111,7 +117,13 @@ get_ontology <- function(name=c("mondo",
   }
   #### Subset ontology ####
   ont <- filter_ontology(ont = ont, 
-                         terms = terms) 
+                         terms = terms)
+  #### Cache RDS object ####
+  if(!is.null(save_dir)){
+    dir.create(save_dir, recursive=TRUE, showWarnings = FALSE)
+    messager("Saving ontology -->",save_path)
+    saveRDS(ont, save_path)
+  }
   #### Return ####
   return(ont)
 }
diff --git a/R/map_upheno_data_i.R b/R/map_upheno_data_i.R
@@ -88,20 +88,21 @@ map_upheno_data_i <- function(pheno_map_method,
       all.y = keep_nogenes,
       suffixes = c(1,2),
       allow.cartesian = TRUE
-    )
+    ) 
+    pheno_map_genes[,db2:=id2_db]
     ## Fill in missing species for those without gene data
-    # pheno_map_genes[
-    #   is.na(gene_taxon_label1),
-    #   gene_taxon_label1:=species_map[db1]$subject_taxon_label]
-    # pheno_map_genes[
-    #   is.na(gene_taxon_label2),
-    #   gene_taxon_label2:=species_map[db2]$subject_taxon_label]
+    pheno_map_genes[
+      is.na(gene_taxon_label1),
+      gene_taxon_label1:=species_map[db1]$subject_taxon_label]
+    pheno_map_genes[
+      is.na(gene_taxon_label2),
+      gene_taxon_label2:=species_map[db2]$subject_taxon_label]
     ## Add gene counts
-    # pheno_map_genes[,n_genes_db1:=data.table::uniqueN(gene_label1), by="id1"]
-    # pheno_map_genes[,n_genes_db2:=data.table::uniqueN(gene_label2), by="id2"]
+    pheno_map_genes[,n_genes_db1:=data.table::uniqueN(gene_label1), by="id1"]
+    pheno_map_genes[,n_genes_db2:=data.table::uniqueN(gene_label2), by="id2"]
     ## Report
-    messager(data.table::uniqueN(pheno_map_genes$subject_taxon_label2),"/",
-             data.table::uniqueN(genes_homol$subject_taxon_label),
+    messager(data.table::uniqueN(pheno_map_genes$gene_taxon_label2),"/",
+             data.table::uniqueN(genes_homol$gene_taxon_label),
              "species remain after cross-species phenotype mapping.")
     ## Remove
     # remove(genes_human,genes_nonhuman,pheno_map)
@@ -116,18 +117,18 @@ map_upheno_data_i <- function(pheno_map_method,
     }
     pheno_map_genes_match <-
       pheno_map_genes_match[,
-                            list(n_genes_intersect=data.table::uniqueN(hgnc_id2)),
-                            by=c("id1","db1","label1","n_genes_db1",
-                                 "id2","db2","label2","n_genes_db2",
-                                 "subject_taxon1","subject_taxon_label1",
-                                 "subject_taxon2","subject_taxon_label2",
+                            list(n_genes_intersect=data.table::uniqueN(hgnc2)),
+                            by=c("id1","db1","object_label1","n_genes_db1",
+                                 "id2","db2","object_label2","n_genes_db2",
+                                 "gene_taxon1","gene_taxon_label1",
+                                 "gene_taxon2","gene_taxon_label2",
                                  "equivalence_score","subclass_score")
       ] |>
       data.table::setorderv("n_genes_intersect",-1)
     pheno_map_genes_match[,n_phenotypes:=data.table::uniqueN(id1),
                           by=c("db1","db2",
-                               "subject_taxon1","subject_taxon2",
-                               "subject_taxon_label1","subject_taxon_label2"
+                               "gene_taxon1","gene_taxon2",
+                               "gene_taxon_label1","gene_taxon_label2"
                           )]
     pheno_map_genes_match[,prop_intersect:=(n_genes_intersect/n_genes_db1)]
     ## Compute a score that captures both the phenotype mapping score and

diff --git a/R/plot_upheno.R b/R/plot_upheno.R
@@ -17,27 +17,25 @@
 #'   pheno_map_genes_match = pheno_map_genes_match)
 #' }
 plot_upheno <- function(pheno_map_genes_match=NULL,
-                        subset_db1="HP",
+                        filters=list(db1="HP",
+                                     gene_taxon_label1="Homo sapiens"),
                         types=c("rainplot","scatterplot","heatmap")){
   ## Prepare plot data
-  n_phenotypes <- object_db <- subject <-NULL;
+  id1 <- n_phenotypes <-NULL;
   {
     #### Subset data ####
-    if(!is.null(subset_db1)) {
-      plot_dat <- pheno_map_genes_match[object_db %in% subset_db1,]
-      messager(data.table::uniqueN(plot_dat$subject_taxon_label2),"/",
-               data.table::uniqueN(pheno_map_genes_match$subject_taxon_label2),
-               "species remain after filtering by `subset_db1`.")
-    }else {
-      plot_dat <- data.table::copy(pheno_map_genes_match)
-    }
+    plot_dat <- filter_dt(pheno_map_genes_match, 
+                          filters = filters)
+    messager(data.table::uniqueN(plot_dat$gene_taxon_label2),"/",
+             data.table::uniqueN(pheno_map_genes_match$gene_taxon_label2),
+             "species remain after filtering by `subset_db1`.")
     ## Only use one phenotype-phenotype per row
-    plot_dat <- plot_dat[,.SD[1],by=c("object_db","subject","object_db","object")]
+    plot_dat <- plot_dat[,.SD[1],by=c("db1","id1","db2","id2")]
     ## Recompute n_phenotypes
-    plot_dat[,n_phenotypes:=data.table::uniqueN(subject),
-             by=c("object_db","object_db",
-                  "subject_taxon1","subject_taxon2",
-                  "subject_taxon_label1","subject_taxon_label2"
+    plot_dat[,n_phenotypes:=data.table::uniqueN(id1),
+             by=c("db1","db2",
+                  "gene_taxon1","gene_taxon2",
+                  "gene_taxon_label1","gene_taxon_label2"
              )]
   }
   plots <- list()

diff --git a/R/plot_upheno_heatmap.R b/R/plot_upheno_heatmap.R
@@ -1,31 +1,45 @@
 #' @describeIn plot_ plot_
 plot_upheno_heatmap <- function(plot_dat,
-                               ont,
+                               ont=get_ontology("upheno",
+                                                add_ancestors = 10),
                                hpo_ids=NULL,
                                value.var=c("phenotype_genotype_score",
                                            "prop_intersect",
                                            "equivalence_score",
                                            "subclass_score"),
+                               name=value.var[1],
                                min_rowsums=NULL,
                                cluster_from_ontology=FALSE,
                                save_dir=tempdir(),
                                height = 15,
                                width = 10){
   requireNamespace("ComplexHeatmap")
-  subject <- hpo_id <- label1 <- NULL;
+  hpo_id <- object_label1 <- NULL;
   set.seed(2023)
   value.var <- match.arg(value.var)
+  value.var <- value.var[1]
   # hpo_ids <- MultiEWCE::example_targets$top_targets$hpo_id
 
   ### Subset phenotypes
-  if(!is.null(hpo_ids)) plot_dat <- plot_dat[subject %in% unique(hpo_ids)]
-  plot_dat[,hpo_id:=subject][,label1:=gsub(" (HPO)","",label1,fixed = TRUE)]
-  plot_dat <- add_ancestors(plot_dat)
-  data.table::setkeyv(plot_dat,"label1")
+  if(!is.null(hpo_ids)) plot_dat <- plot_dat[id1 %in% unique(hpo_ids)]
+  plot_dat[,hpo_id:=id1][,object_label1:=gsub(" (HPO)","",
+                                              object_label1,fixed = TRUE)]
+  if(isFALSE(cluster_from_ontology)){
+    ont <- add_ancestors(ont)
+    if(!"ancestor_name" %in% names(plot_dat)){
+      plot_dat <- merge(
+        plot_dat,
+        ont@elementMetadata[,c("short_id","ancestor_name")],
+        by.x="id1",
+        by.y="short_id"
+      )
+    } 
+  }
+  data.table::setkeyv(plot_dat,"object_label1")
 
   ### Plot
   X <- data.table::dcast.data.table(plot_dat,
-                                    formula = label1 ~ subject_taxon_label2,
+                                    formula = object_label1 ~ gene_taxon_label2,
                                     fill = 0,
                                     value.var = value.var,
                                     fun.aggregate = mean,
@@ -39,33 +53,43 @@ plot_upheno_heatmap <- function(plot_dat,
 
   #### Get clusters from ontology ####
   if(isTRUE(cluster_from_ontology)){
+    nms <- map_ontology_terms(ont = ont,
+                              terms = plot_dat$id1,
+                              to = "name",
+                              keep_order = FALSE)
+    nms <- intersect(nms, rownames(X))
     ids <- map_ontology_terms(ont = ont,
-                              terms = plot_dat$subject,
+                              terms =nms,
+                              to = "id",
                               keep_order = FALSE)
-    ids <- ids[ids %in% rownames(X)]
-    ## best to do this on the entire HPO, then subset
-    cluster_rows <- ontology_to(to="igraph_dist_hclust",
-                                terms = names(ids))
+    ## best to do this on the entire HPO, then subset 
+    gd <- ontology_to(ont=ont,
+                      to="igraph_dist",
+                      terms = unname(ids))
+    cluster_rows <- stats::hclust(d = as.dist(gd[unname(ids),unname(ids)]))
+    # cluster_rows[cluster_rows$labels[1:10]]
     # leaves <- dendextend::get_leaves_attr(cluster_rows,"label")
     # c2 <- dendextend::prune(cluster_rows,
     #                         leaves[!leaves %in% names(ids)],
     #                         keep_branches = FALSE)
     ## subset to only those in the heatmap
-    X <- X[ids,]
+    X <- X[nms,]
   } else {
     cluster_rows <- TRUE
   }
   #### Add annotation ####
+  cols <- c("object_label1","n_genes_db1",
+            "ancestor_name")
+  cols <- cols[cols %in% names(plot_dat)]
   annot_dat <- plot_dat[rownames(X),
-                        c("label1","n_genes_db1",
-                          "ancestor_name")] |> unique()
+                        cols, with=FALSE] |> unique()
   # col_fun <- colorRamp2::colorRamp2(
   #   seq(min(annot_dat$n_genes_db1),
   #       max(annot_dat$n_genes_db1),
   #       length.out=4),
   #   pals::gnuplot(4))
   la <- ComplexHeatmap::rowAnnotation(
-    df=data.frame(annot_dat[,-c("label1")],
+    df=data.frame(annot_dat[,-c("object_label1")],
                   row.names = annot_dat$label1),
     # ?ComplexHeatmap::Legend
     show_legend = c(TRUE, FALSE),
@@ -75,7 +99,7 @@ plot_upheno_heatmap <- function(plot_dat,
   row_split <- if(isFALSE(cluster_from_ontology)) annot_dat$ancestor_name
   #### make heatmap ####
   ch <- ComplexHeatmap::Heatmap(matrix = as.matrix(X),
-                                name = value.var,
+                                name = name,
                                 cluster_rows = cluster_rows,
                                 row_title_rot = 0,
                                 row_names_gp = grid::gpar(fontsize = 7),

diff --git a/R/plot_upheno_rainplot.R b/R/plot_upheno_rainplot.R
@@ -9,11 +9,11 @@ plot_upheno_rainplot <- function(plot_dat){
 
   ### Plot proportion of intersecting orthologs per ontology ####
   ggplot2::ggplot(plot_dat,
-         ggplot2::aes(x=paste0(subject_taxon_label2,
+         ggplot2::aes(x=paste0(gene_taxon_label2,
                       "\n(n = ",n_phenotypes," phenotypes)"),
              y=(n_genes_intersect/n_genes_db1),
-             fill=factor(object_db))) +
-    ggplot2::facet_grid(object_db~.,
+             fill=factor(db2))) +
+    ggplot2::facet_grid(db2~.,
                scales = "free_y",
                space = "free_y") +
     # add half-violin from {ggdist} package

diff --git a/R/plot_upheno_scatterplot.R b/R/plot_upheno_scatterplot.R
@@ -8,6 +8,6 @@ plot_upheno_scatterplot <- function(plot_dat){
                   ggplot2::aes(x=(n_genes_intersect/n_genes_db1),
              y=equivalence_score)) +
     ggplot2::geom_point() +
-    ggplot2::facet_grid(rows = "subject_taxon_label2") +
+    ggplot2::facet_grid(rows = "gene_taxon_label2") +
     ggplot2::geom_smooth()
 }
diff --git a/README.Rmd b/README.Rmd
@@ -29,9 +29,9 @@ If you use ``r pkg``, please cite:
 ## Installation 
 
 ```R
-if(!require("remotes")) install.packages("remotes")
+if(!require("BiocManager")) install.packages("BiocManager")
 
-remotes::install_github("`r paste(owner,repo,sep='/')`")
+BiocManager::install("`r paste(owner,repo,sep='/')`")
 library(`r pkg`)
 ```
 ## Documentation 

diff --git a/README.md b/README.md
@@ -12,31 +12,33 @@ status](https://github.com/neurogenomics/KGExplorer/workflows/rworkflows/badge.s
 <br>
 <a href='https://app.codecov.io/gh/neurogenomics/KGExplorer/tree/master' target='_blank'><img src='https://codecov.io/gh/neurogenomics/KGExplorer/branch/master/graphs/icicle.svg' title='Codecov icicle graph' width='200' height='50' style='vertical-align: top;'></a>  
 <h4>  
-Authors: <i>yourGivenName yourFamilyName</i>  
+Authors: <i>Brian Schilder</i>  
 </h4>
 <h4>  
-README updated: <i>Dec-09-2023</i>  
+README updated: <i>Mar-08-2024</i>  
 </h4>
 
 <!-- To modify Package/Title/Description/Authors fields, edit the DESCRIPTION file -->
 
 ## `KGExplorer`: Biomedical Knowledge Network Construction and Analysis
 
-### Efficiently construct large-scale knowledge networks from biomedical repositories.
+### Query, construct, and analyse large-scale biomedical knowledge graphs and ontologies.
 
 If you use `KGExplorer`, please cite:
 
 <!-- Modify this by editing the file: inst/CITATION  -->
 
-> author1, author2, author3 (publicationYear) articleTitle,
-> *journalName*; volumeNumber, [linkToPublication](linkToPublication)
+> Kitty B. Murphy, Robert Gordon-Smith, Jai Chapman, Momoko Otani, Brian
+> M. Schilder, Nathan G. Skene (2023) Identification of cell
+> type-specific gene targets underlying thousands of rare diseases and
+> subtraits. medRxiv, <https://doi.org/10.1101/2023.02.13.23285820>
 
 ## Installation
 
 ``` r
-if(!require("remotes")) install.packages("remotes")
+if(!require("BiocManager")) install.packages("BiocManager")
 
-remotes::install_github("neurogenomics/KGExplorer")
+BiocManager::install("neurogenomics/KGExplorer")
 library(KGExplorer)
 ```
 
@@ -58,7 +60,7 @@ utils::sessionInfo()
 
     ## R version 4.3.1 (2023-06-16)
     ## Platform: aarch64-apple-darwin20 (64-bit)
-    ## Running under: macOS Sonoma 14.1.1
+    ## Running under: macOS Sonoma 14.3.1
     ## 
     ## Matrix products: default
     ## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
@@ -77,17 +79,17 @@ utils::sessionInfo()
     ##  [1] gtable_0.3.4        jsonlite_1.8.8      renv_1.0.3         
     ##  [4] dplyr_1.1.4         compiler_4.3.1      BiocManager_1.30.22
     ##  [7] tidyselect_1.2.0    rvcheck_0.2.1       scales_1.3.0       
-    ## [10] yaml_2.3.7          fastmap_1.1.1       here_1.0.1         
+    ## [10] yaml_2.3.8          fastmap_1.1.1       here_1.0.1         
     ## [13] ggplot2_3.4.4       R6_2.5.1            generics_0.1.3     
-    ## [16] knitr_1.45          yulab.utils_0.1.0   tibble_3.2.1       
-    ## [19] desc_1.4.2          dlstats_0.1.7       rprojroot_2.0.4    
+    ## [16] knitr_1.45          yulab.utils_0.1.4   tibble_3.2.1       
+    ## [19] desc_1.4.3          dlstats_0.1.7       rprojroot_2.0.4    
     ## [22] munsell_0.5.0       pillar_1.9.0        RColorBrewer_1.1-3 
-    ## [25] rlang_1.1.2         utf8_1.2.4          cachem_1.0.8       
-    ## [28] badger_0.2.3        xfun_0.41           fs_1.6.3           
-    ## [31] memoise_2.0.1.9000  cli_3.6.1           magrittr_2.0.3     
-    ## [34] rworkflows_1.0.2    digest_0.6.33       grid_4.3.1         
+    ## [25] rlang_1.1.3         utf8_1.2.4          cachem_1.0.8       
+    ## [28] badger_0.2.3        xfun_0.42           fs_1.6.3           
+    ## [31] memoise_2.0.1.9000  cli_3.6.2           magrittr_2.0.3     
+    ## [34] rworkflows_1.0.1    digest_0.6.34       grid_4.3.1         
     ## [37] rstudioapi_0.15.0   lifecycle_1.0.4     vctrs_0.6.5        
-    ## [40] data.table_1.14.10  evaluate_0.23       glue_1.6.2         
+    ## [40] data.table_1.15.0   evaluate_0.23       glue_1.7.0         
     ## [43] fansi_1.0.6         colorspace_2.1-0    rmarkdown_2.25     
     ## [46] tools_4.3.1         pkgconfig_2.0.3     htmltools_0.5.7
 

diff --git a/man/get_.Rd b/man/get_.Rd
diff --git a/man/plot_.Rd b/man/plot_.Rd