ZENODO
Dataset · 2021
License: CC BY
Data sources: Datacite
Versions: 2

HMDB Clean

Authors: Hirschey, Matthew

Abstract

#packages assumed to be attached for this script (the bare %>%, str_split, keep,
#flatten_chr, and map_* calls below require them)
library(tidyverse) #dplyr, tidyr, purrr, stringr
library(XML)       #xmlToDataFrame()

#read current release information to set parameters for download
#(current_release.R is expected to define metabolites_url, metabolite_proteins_url, and release)
source(here::here("code", "current_release.R"))

#LOAD BIG data
temp <- tempfile()
download.file(metabolites_url, temp)
hmdb_full <- xmlToDataFrame(unzip(temp)) %>% #extracting XML took a long time for a 4GB XML file!
  janitor::clean_names()
unlink(temp)
unlink("hmdb_metabolites.xml")
#hmdb_full <- xmlToDataFrame(here::here("tmp", "hmdb_metabolites.xml")) %>% janitor::clean_names()

#for getting metabolite classes
class_finder <- function(string) {
  if(stringr::str_detect(string, "class\\b")) {
    class_compounds <- function(string) {
      tmp <- purrr::map(string, ~ unlist(str_split(., pattern = "\\."))) %>%
        purrr::map(., ~keep(.x, ~ str_detect(.x, "class"))) %>%
        flatten_chr(.)
      class <- stringr::str_split(tmp, "class of organic compounds known as ")
      return(stringr::str_to_sentence(class[[1]][[2]]))}
    tryCatch(class_compounds(string), error = function(x){"Organic compound"})
  } else if(stringr::str_detect(string, "classified\\b")) {
    classified_compounds <- function(string) {
      tmp <- purrr::map(string, ~ unlist(str_split(., pattern = "\\."))) %>%
        purrr::map(., ~keep(.x, ~ str_detect(.x, "classified"))) %>%
        flatten_chr(.)
      class <- stringr::str_split(tmp, "classified as a member of the ")
      return(stringr::str_to_sentence(class[[1]][[2]]))}
    tryCatch(classified_compounds(string), error = function(x){"Organic compound"})
  } else if(stringr::str_detect(string, "steroid")) {
    return("Steroid hormone")
  } else if(stringr::str_detect(string, "nucleoside")) {
    return("Nucleoside")
  } else if(stringr::str_detect(string, "acid")) {
    return("Organic acid")
  } else {
    return("Organic compound")
  }
}
#purrr::map(hmdb_class$description, ~ class_finder(.))

hmdb_class <-
  hmdb_full %>%
  #slice(1:10) %>% #for testing
  dplyr::select(name, description) %>%
  dplyr::mutate(class = purrr::map(description, ~ class_finder(.))) %>%
  dplyr::select(-description)

hmdb_names <-
  hmdb_full %>%
  #slice(1:10) %>% #for testing
  dplyr::select(name, synonyms, cid = pubchem_compound_id) %>%
  dplyr::left_join(hmdb_class, by = "name")

hmdb_meta <-
  hmdb_full %>%
  dplyr::select(name, synonyms, cid = pubchem_compound_id, accession, description,
                chemical_formula, average_molecular_weight, wikipedia_id) %>%
  dplyr::left_join(hmdb_class, by = "name")

temp <- tempfile()
download.file(metabolite_proteins_url, temp)
hmdb_proteins_raw <- XML::xmlToDataFrame(unzip(temp)) %>%
  janitor::clean_names()
unlink(temp)
unlink("hmdb_proteins.xml")

censor_proteins <- c("")
censor_metabolites <- c("", "Water", "Hydrogen Ion")

hmdb_proteins_long <-
  hmdb_proteins_raw %>%
  #dplyr::slice(1:20) %>% #for testing
  dplyr::mutate(metabolite_associations = stringr::str_replace_all(metabolite_associations, "HMDB", "\\.HMDB")) %>%
  tidyr::separate_rows(metabolite_associations, sep = "\\.") %>%
  dplyr::filter(metabolite_associations != "") %>%
  tidyr::separate(metabolite_associations, into = c("metabolite_accession", "metabolite_name"), sep = 11) %>%
  dplyr::select(gene_name, metabolite_name, gene_accession = accession, metabolite_accession) %>%
  dplyr::filter(!gene_name %in% censor_proteins,
                !metabolite_name %in% censor_metabolites,
                stringr::str_detect(metabolite_accession, "HMDB"))

# tmp <- "Water"
# tmp <- "Glucose-6-Phosphate"
# tmp <- "3-Carbamoyl-2-phenylpropionaldehyde"
# tmp <- "-1(11),7,9-trien-11-ol"
# metabolite_string <- "R-95913"

#collapse metabolite name if it's long
collapse_metabolites <- function(metabolite_string) {
  # #skip if missing
  # if(stringr::str_length(metabolite_string) == 0 | is.na(metabolite_string)) {
  #   return(metabolite_string)
  # }

  #skip if too short
  if(stringr::str_length(metabolite_string) < 7) {
    return(metabolite_string)
  }

  #get first word
  new_metabolite_string <- stringr::str_extract(metabolite_string, "[[:alpha:]]\\w+") #alpha omits numbers and punct, w gets word character, + gets one or more

  #if new_string is NA b/c it has a bizarre name (like a drug name), code breaks; so return original string
  if(is.na(new_metabolite_string)) {
    return(metabolite_string)
  }

  #add plus to indicate that it got collapsed
  if(metabolite_string != new_metabolite_string) {
    new_metabolite_string <- glue::glue("{new_metabolite_string} +")
  }

  return(new_metabolite_string)
}
# collapse_metabolites(tmp)

# simplify_metabolites <- function(df) {
#   # if (nrow(df) < 20) {
#   #   return(df)
#   # }
#   new_df <-
#     df %>%
#     mutate(metabolite_name_simple = map_chr(.x = df[[2]], .f = collapse_metabolites))
#   return(new_df)
# }

hmdb_proteins_full <-
  hmdb_proteins_long %>%
  dplyr::group_by(fav_gene = gene_name) %>%
  tidyr::nest() %>%
  dplyr::filter(!fav_gene %in% censor_proteins) %>%
  dplyr::mutate(original_num_rows = map_int(data, nrow))

hmdb_proteins <-
  hmdb_proteins_long %>%
  dplyr::mutate(metabolite_name_simple = map_chr(metabolite_name, collapse_metabolites)) %>%
  dplyr::distinct(gene_name, metabolite_name_simple, .keep_all = TRUE) %>%
  dplyr::group_by(fav_gene = gene_name) %>%
  tidyr::nest() %>%
  dplyr::filter(!fav_gene %in% censor_proteins) %>%
  dplyr::mutate(num_rows = map_int(data, nrow)) %>%
  dplyr::left_join(hmdb_proteins_full, by = "fav_gene", suffix = c("_collapsed", "_original")) %>%
  dplyr::arrange(desc(num_rows))
#colnames(hmdb_proteins) are fav_gene, data_collapsed, num_rows, data_original, original_num_rows

hmdb_metabolites <-
  hmdb_proteins_long %>%
  #dplyr::mutate(metabolite_name = stringr::str_to_lower(metabolite_name)) %>%
  dplyr::group_by(fav_metabolite = metabolite_name) %>%
  tidyr::nest() %>%
  dplyr::filter(!fav_metabolite %in% censor_metabolites) %>%
  dplyr::mutate(num_rows = map_int(data, nrow)) %>%
  dplyr::arrange(desc(num_rows))

#save files
#saveRDS(hmdb_full, file = here::here("data", paste0(release, "_hmdb_full.Rds")))
saveRDS(hmdb_names, file = here::here("data", paste0(release, "_hmdb_names.Rds")))
saveRDS(hmdb_meta, file = here::here("data", paste0(release, "_hmdb_meta.Rds")))
saveRDS(hmdb_proteins, file = here::here("data", paste0(release, "_hmdb_proteins.Rds")))
saveRDS(hmdb_metabolites, file = here::here("data", paste0(release, "_hmdb_metabolites.Rds")))
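For quick reference, the collapse_metabolites() helper behaves as follows on the test names commented out in the script above. This is only an illustrative usage sketch, not part of the exported files, and the annotated outputs are what the function as written should return.

collapse_metabolites("Water")                               #shorter than 7 characters, returned unchanged: "Water"
collapse_metabolites("Glucose-6-Phosphate")                 #collapsed to its first word and flagged: "Glucose +"
collapse_metabolites("3-Carbamoyl-2-phenylpropionaldehyde") #collapsed to its first alphabetic word: "Carbamoyl +"
collapse_metabolites("R-95913")                             #no alphabetic word to extract, returned unchanged: "R-95913"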

This is a cleaned and condensed version of the HMDB metabolites XML file from https://hmdb.ca/downloads, current version 5.0. The original zipped file (~1 GB) extracts to a 6.49 GB XML file; the full file is cleaned and only small sub-files are exported. The file is too large to extract from the zip archive on the server and gets clipped (to about 4 GB), so the XML-to-data-frame conversion fails because of missing data. This record, containing 4 files, is an alternative pointer to only the data needed. The code that produces them from the hmdb_full file is pasted in the notes above. These 4 files are used on www.datadrivenhypothesis.org.
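As a sketch of how a downstream analysis might read these exported files back in, under two stated assumptions: the release prefix resolves to "5.0" (the actual value is set by code/current_release.R), and "CYP3A4" stands in as a hypothetical query gene.

#minimal sketch (not part of this record's code): read one exported file and
#look up the metabolites associated with a single gene
library(tidyverse)

hmdb_proteins <- readRDS(here::here("data", "5.0_hmdb_proteins.Rds")) #assumed "5.0" prefix

hmdb_proteins %>%
  dplyr::filter(fav_gene == "CYP3A4") %>%        #hypothetical query gene
  tidyr::unnest(cols = data_collapsed) %>%       #expand the nested association table
  dplyr::select(fav_gene, metabolite_name_simple, metabolite_accession)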

  • Impact by BIP!
    selected citations (derived from selected sources; an alternative to the "Influence" indicator, which also reflects the overall/total impact of an article in the research community at large, based on the underlying citation network, diachronically): 0
    popularity (the "current" impact/attention of an article in the research community at large, based on the underlying citation network): Average
    influence (the overall/total impact of an article in the research community at large, based on the underlying citation network, diachronically): Average
    impulse (the initial momentum of an article directly after its publication, based on the underlying citation network): Average
  • Usage by OpenAIRE UsageCounts
    views: 5