ZENODO
Dataset · 2021
License: CC BY
Data sources: Datacite
Versions: 2

HMDB Clean

Authors: Hirschey, Matthew

Abstract

#packages assumed to be attached for this script (the bare %>%, str_split, keep,
#flatten_chr, and map_* calls below require them)
library(tidyverse) #dplyr, tidyr, purrr, stringr
library(XML)       #xmlToDataFrame()

#read current release information to set parameters for download
#(current_release.R is expected to define metabolites_url, metabolite_proteins_url, and release)
source(here::here("code", "current_release.R"))

#LOAD BIG data
temp <- tempfile()
download.file(metabolites_url, temp)
hmdb_full <- xmlToDataFrame(unzip(temp)) %>% #extracting XML took a long time for a 4GB XML file!
  janitor::clean_names()
unlink(temp)
unlink("hmdb_metabolites.xml")
#hmdb_full <- xmlToDataFrame(here::here("tmp", "hmdb_metabolites.xml")) %>% janitor::clean_names()

#for getting metabolite classes
class_finder <- function(string) {
  if(stringr::str_detect(string, "class\\b")) {
    class_compounds <- function(string) {
      tmp <- purrr::map(string, ~ unlist(str_split(., pattern = "\\."))) %>%
        purrr::map(., ~keep(.x, ~ str_detect(.x, "class"))) %>%
        flatten_chr(.)
      class <- stringr::str_split(tmp, "class of organic compounds known as ")
      return(stringr::str_to_sentence(class[[1]][[2]]))}
    tryCatch(class_compounds(string), error = function(x){"Organic compound"})
  } else if(stringr::str_detect(string, "classified\\b")) {
    classified_compounds <- function(string) {
      tmp <- purrr::map(string, ~ unlist(str_split(., pattern = "\\."))) %>%
        purrr::map(., ~keep(.x, ~ str_detect(.x, "classified"))) %>%
        flatten_chr(.)
      class <- stringr::str_split(tmp, "classified as a member of the ")
      return(stringr::str_to_sentence(class[[1]][[2]]))}
    tryCatch(classified_compounds(string), error = function(x){"Organic compound"})
  } else if(stringr::str_detect(string, "steroid")) {
    return("Steroid hormone")
  } else if(stringr::str_detect(string, "nucleoside")) {
    return("Nucleoside")
  } else if(stringr::str_detect(string, "acid")) {
    return("Organic acid")
  } else {
    return("Organic compound")
  }
}
#purrr::map(hmdb_class$description, ~ class_finder(.))

hmdb_class <-
  hmdb_full %>%
  #slice(1:10) %>% #for testing
  dplyr::select(name, description) %>%
  dplyr::mutate(class = purrr::map(description, ~ class_finder(.))) %>%
  dplyr::select(-description)

hmdb_names <-
  hmdb_full %>%
  #slice(1:10) %>% #for testing
  dplyr::select(name, synonyms, cid = pubchem_compound_id) %>%
  dplyr::left_join(hmdb_class, by = "name")

hmdb_meta <-
  hmdb_full %>%
  dplyr::select(name, synonyms, cid = pubchem_compound_id, accession, description,
                chemical_formula, average_molecular_weight, wikipedia_id) %>%
  dplyr::left_join(hmdb_class, by = "name")

temp <- tempfile()
download.file(metabolite_proteins_url, temp)
hmdb_proteins_raw <- XML::xmlToDataFrame(unzip(temp)) %>%
  janitor::clean_names()
unlink(temp)
unlink("hmdb_proteins.xml")

censor_proteins <- c("")
censor_metabolites <- c("", "Water", "Hydrogen Ion")

hmdb_proteins_long <-
  hmdb_proteins_raw %>%
  #dplyr::slice(1:20) %>% #for testing
  dplyr::mutate(metabolite_associations = stringr::str_replace_all(metabolite_associations, "HMDB", "\\.HMDB")) %>%
  tidyr::separate_rows(metabolite_associations, sep = "\\.") %>%
  dplyr::filter(metabolite_associations != "") %>%
  tidyr::separate(metabolite_associations, into = c("metabolite_accession", "metabolite_name"), sep = 11) %>%
  dplyr::select(gene_name, metabolite_name, gene_accession = accession, metabolite_accession) %>%
  dplyr::filter(!gene_name %in% censor_proteins,
                !metabolite_name %in% censor_metabolites,
                stringr::str_detect(metabolite_accession, "HMDB"))

# tmp <- "Water"
# tmp <- "Glucose-6-Phosphate"
# tmp <- "3-Carbamoyl-2-phenylpropionaldehyde"
# tmp <- "-1(11),7,9-trien-11-ol"
# metabolite_string <- "R-95913"

#collapse metabolite name if it's long
collapse_metabolites <- function(metabolite_string) {
  # #skip if missing
  # if(stringr::str_length(metabolite_string) == 0 | is.na(metabolite_string)) {
  #   return(metabolite_string)
  # }

  #skip if too short
  if(stringr::str_length(metabolite_string) < 7) {
    return(metabolite_string)
  }

  #get first word
  new_metabolite_string <- stringr::str_extract(metabolite_string, "[[:alpha:]]\\w+") #alpha omits numbers and punct, w gets word character, + gets one or more

  #if new_string is NA b/c it has a bizarre name (like a drug name), code breaks; so return original string
  if(is.na(new_metabolite_string)) {
    return(metabolite_string)
  }

  #add plus to indicate that it got collapsed
  if(metabolite_string != new_metabolite_string) {
    new_metabolite_string <- glue::glue("{new_metabolite_string} +")
  }

  return(new_metabolite_string)
}
# collapse_metabolites(tmp)

# simplify_metabolites <- function(df) {
#   # if (nrow(df) < 20) {
#   #   return(df)
#   # }
#   new_df <-
#     df %>%
#     mutate(metabolite_name_simple = map_chr(.x = df[[2]], .f = collapse_metabolites))
#   return(new_df)
# }

hmdb_proteins_full <-
  hmdb_proteins_long %>%
  dplyr::group_by(fav_gene = gene_name) %>%
  tidyr::nest() %>%
  dplyr::filter(!fav_gene %in% censor_proteins) %>%
  dplyr::mutate(original_num_rows = map_int(data, nrow))

hmdb_proteins <-
  hmdb_proteins_long %>%
  dplyr::mutate(metabolite_name_simple = map_chr(metabolite_name, collapse_metabolites)) %>%
  dplyr::distinct(gene_name, metabolite_name_simple, .keep_all = TRUE) %>%
  dplyr::group_by(fav_gene = gene_name) %>%
  tidyr::nest() %>%
  dplyr::filter(!fav_gene %in% censor_proteins) %>%
  dplyr::mutate(num_rows = map_int(data, nrow)) %>%
  dplyr::left_join(hmdb_proteins_full, by = "fav_gene", suffix = c("_collapsed", "_original")) %>%
  dplyr::arrange(desc(num_rows))
#colnames(hmdb_proteins) are fav_gene, data_collapsed, num_rows, data_original, original_num_rows

hmdb_metabolites <-
  hmdb_proteins_long %>%
  #dplyr::mutate(metabolite_name = stringr::str_to_lower(metabolite_name)) %>%
  dplyr::group_by(fav_metabolite = metabolite_name) %>%
  tidyr::nest() %>%
  dplyr::filter(!fav_metabolite %in% censor_metabolites) %>%
  dplyr::mutate(num_rows = map_int(data, nrow)) %>%
  dplyr::arrange(desc(num_rows))

#save files
#saveRDS(hmdb_full, file = here::here("data", paste0(release, "_hmdb_full.Rds")))
saveRDS(hmdb_names, file = here::here("data", paste0(release, "_hmdb_names.Rds")))
saveRDS(hmdb_meta, file = here::here("data", paste0(release, "_hmdb_meta.Rds")))
saveRDS(hmdb_proteins, file = here::here("data", paste0(release, "_hmdb_proteins.Rds")))
saveRDS(hmdb_metabolites, file = here::here("data", paste0(release, "_hmdb_metabolites.Rds")))
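For quick reference, the collapse_metabolites() helper behaves as follows on the test names commented out in the script above. This is only an illustrative usage sketch, not part of the exported files, and the annotated outputs are what the function as written should return.

collapse_metabolites("Water")                               #shorter than 7 characters, returned unchanged: "Water"
collapse_metabolites("Glucose-6-Phosphate")                 #collapsed to its first word and flagged: "Glucose +"
collapse_metabolites("3-Carbamoyl-2-phenylpropionaldehyde") #collapsed to its first alphabetic word: "Carbamoyl +"
collapse_metabolites("R-95913")                             #no alphabetic word to extract, returned unchanged: "R-95913"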

This is a cleaned and condensed version of the HMDB metabolites XML file from https://hmdb.ca/downloads, current version 5.0. The original zipped file (~1 GB) extracts to a 6.49 GB XML file; the full file is cleaned and only small sub-files are exported. The file is too large to extract from the zip archive on the server and gets clipped (to about 4 GB), so the XML-to-data-frame conversion fails because of missing data. This record, containing 4 files, is an alternative pointer to only the data needed. The code that produces them from the hmdb_full file is pasted in the notes above. These 4 files are used on www.datadrivenhypothesis.org.
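As a sketch of how a downstream analysis might read these exported files back in, under two stated assumptions: the release prefix resolves to "5.0" (the actual value is set by code/current_release.R), and "CYP3A4" stands in as a hypothetical query gene.

#minimal sketch (not part of this record's code): read one exported file and
#look up the metabolites associated with a single gene
library(tidyverse)

hmdb_proteins <- readRDS(here::here("data", "5.0_hmdb_proteins.Rds")) #assumed "5.0" prefix

hmdb_proteins %>%
  dplyr::filter(fav_gene == "CYP3A4") %>%        #hypothetical query gene
  tidyr::unnest(cols = data_collapsed) %>%       #expand the nested association table
  dplyr::select(fav_gene, metabolite_name_simple, metabolite_accession)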

  • Impact by BIP!
    selected citations (derived from selected sources; an alternative to the "Influence" indicator, which also reflects the overall/total impact of an article in the research community at large, based on the underlying citation network, diachronically): 0
    popularity (the "current" impact/attention of an article in the research community at large, based on the underlying citation network): Average
    influence (the overall/total impact of an article in the research community at large, based on the underlying citation network, diachronically): Average
    impulse (the initial momentum of an article directly after its publication, based on the underlying citation network): Average
  • Usage by OpenAIRE UsageCounts
    views: 5