rentrez/inst/CITATION

bibentry(
    header = "To cite rentrez in publications use:",
    bibtype = "Article",
    title = "{rentrez}: an R package for the NCBI eUtils API",
    author = as.person("David J. Winter"),
    journal = "The R Journal",
    year = "2017",
    volume = "9",
    issue = "2",
    pages = "520--526",
    textVersion = paste("Winter, D. J. (2017)",
                        "rentrez: an R package for the NCBI eUtils API",
                        "The R Journal 9(2):520-526")
)

rentrez/inst/doc/rentrez_tutorial.R

## ---- count_recs, echo=FALSE---------------------------------------------
library(rentrez)
count_recs <- function(db, denom) {
    nrecs <- rentrez::entrez_db_summary(db)["Count"]
    round(as.integer(nrecs)/denom, 1)
}

## ---- dbs----------------------------------------------------------------
entrez_dbs()

## ---- cdd----------------------------------------------------------------
entrez_db_summary("cdd")

## ---- sra_eg--------------------------------------------------------------
entrez_db_searchable("sra")

## ----eg_search------------------------------------------------------------
r_search <- entrez_search(db="pubmed", term="R Language")

## ----print_search---------------------------------------------------------
r_search

## ----search_ids-----------------------------------------------------------
r_search$ids

## ----searchids_2----------------------------------------------------------
another_r_search <- entrez_search(db="pubmed", term="R Language", retmax=40)
another_r_search

## ---- Tt------------------------------------------------------------------
entrez_search(db="sra",
              term="Tetrahymena thermophila[ORGN]",
              retmax=0)

## ---- Tt2-----------------------------------------------------------------
entrez_search(db="sra",
              term="Tetrahymena thermophila[ORGN] AND 2013:2015[PDAT]",
              retmax=0)

## ---- Tt3-----------------------------------------------------------------
entrez_search(db="sra",
              term="(Tetrahymena thermophila[ORGN] OR Tetrahymena borealis[ORGN]) AND 2013:2015[PDAT]",
              retmax=0)

## ---- sra_searchable------------------------------------------------------
entrez_db_searchable("sra")

## ---- mesh----------------------------------------------------------------
entrez_search(db   = "pubmed",
              term = "(vivax malaria[MeSH]) AND (folic acid antagonists[MeSH])")

## ---- connectome, fig.width=5, fig.height=4, fig.align='center'-----------
search_year <- function(year, term){
    query <- paste(term, "AND (", year, "[PDAT])")
    entrez_search(db="pubmed", term=query, retmax=0)$count
}

year <- 2008:2014
papers <- sapply(year, search_year, term="Connectome", USE.NAMES=FALSE)

plot(year, papers, type='b', main="The Rise of the Connectome")

## ----elink0----------------------------------------------------------------
all_the_links <- entrez_link(dbfrom='gene', id=351, db='all')
all_the_links

## ----elink_link--------------------------------------------------------------
all_the_links$links

## ---- elink_pmc---------------------------------------------------------------
all_the_links$links$gene_pmc[1:10]

## ---- elink_omim--------------------------------------------------------------
all_the_links$links$gene_clinvar

## ---- elink1-------------------------------------------------------------------
nuc_links <- entrez_link(dbfrom='gene', id=351, db='nuccore')
nuc_links
nuc_links$links

## ---- elinik_refseqs------------------------------------------------------------
nuc_links$links$gene_nuccore_refseqrna

## ---- outlinks-------------------------------------------------------------------
paper_links <- entrez_link(dbfrom="pubmed", id=25500142, cmd="llinks")
paper_links

## ---- urls-----------------------------------------------------------------------
paper_links$linkouts

## ----just_urls--------------------------------------------------------------------
linkout_urls(paper_links)

## ---- multi_default-----------------------------------------------------------------
all_links_together <- entrez_link(db="protein", dbfrom="gene", id=c("93100", "223646"))
all_links_together
all_links_together$links$gene_protein

## ---- multi_byid--------------------------------------------------------------------
all_links_sep <- entrez_link(db="protein", dbfrom="gene", id=c("93100", "223646"), by_id=TRUE)
all_links_sep
lapply(all_links_sep, function(x) x$links$gene_protein)

## ---- Summ_1------------------------------------------------------------------------
taxize_summ <- entrez_summary(db="pubmed", id=24555091)
taxize_summ

## ---- Summ_2------------------------------------------------------------------------
taxize_summ$articleids

## ---- Summ_3------------------------------------------------------------------------
taxize_summ$pmcrefcount

## ---- multi_summ--------------------------------------------------------------------
vivax_search <- entrez_search(db = "pubmed",
                              term = "(vivax malaria[MeSH]) AND (folic acid antagonists[MeSH])")
multi_summs <- entrez_summary(db="pubmed", id=vivax_search$ids)

## ---- multi_summ2-------------------------------------------------------------------
extract_from_esummary(multi_summs, "fulljournalname")

## ---- multi_summ3-------------------------------------------------------------------
date_and_cite <- extract_from_esummary(multi_summs, c("pubdate", "pmcrefcount", "title"))
knitr::kable(head(t(date_and_cite)), row.names=FALSE)

## ---- transcript_ids----------------------------------------------------------------
gene_ids <- c(351, 11647)
linked_seq_ids <- entrez_link(dbfrom="gene", id=gene_ids, db="nuccore")
linked_transripts <- linked_seq_ids$links$gene_nuccore_refseqrna
head(linked_transripts)

## ----fetch_fasta--------------------------------------------------------------------
all_recs <- entrez_fetch(db="nuccore", id=linked_transripts, rettype="fasta")
class(all_recs)
nchar(all_recs)

## ---- peak--------------------------------------------------------------------------
cat(strwrap(substr(all_recs, 1, 500)), sep="\n")

## ---- Tt_tax------------------------------------------------------------------------
Tt <- entrez_search(db="taxonomy", term="(Tetrahymena thermophila[ORGN]) AND Species[RANK]")
tax_rec <- entrez_fetch(db="taxonomy", id=Tt$ids, rettype="xml", parsed=TRUE)
class(tax_rec)

## ---- Tt_list-----------------------------------------------------------------------
tax_list <- XML::xmlToList(tax_rec)
tax_list$Taxon$GeneticCode

## ---- Tt_path-----------------------------------------------------------------------
tt_lineage <- tax_rec["//LineageEx/Taxon/ScientificName"]
tt_lineage[1:4]

## ---- Tt_apply----------------------------------------------------------------------
XML::xpathSApply(tax_rec, "//LineageEx/Taxon/ScientificName", XML::xmlValue)

## ---- asthma------------------------------------------------------------------------
upload <- entrez_post(db="omim", id=600807)
upload

## ---- snail_search------------------------------------------------------------------
entrez_search(db="nuccore", term="COI[Gene] AND Gastropoda[ORGN]")

## ---- snail_history-----------------------------------------------------------------
snail_coi <- entrez_search(db="nuccore",
                           term="COI[Gene] AND Gastropoda[ORGN]",
                           use_history=TRUE)
snail_coi
snail_coi$web_history

## ---- asthma_links------------------------------------------------------------------
asthma_clinvar <- entrez_link(dbfrom="omim", db="clinvar", cmd="neighbor_history", id=600807)
asthma_clinvar$web_histories

## ---- asthma_links_upload-----------------------------------------------------------
asthma_variants <- entrez_link(dbfrom="omim", db="clinvar",
                               cmd="neighbor_history",
                               web_history=upload)
asthma_variants

## ---- links-------------------------------------------------------------------------
snp_links <- entrez_link(dbfrom="clinvar", db="snp",
                         web_history=asthma_variants$web_histories$omim_clinvar,
                         cmd="neighbor_history")
snp_summ <- entrez_summary(db="snp", web_history=snp_links$web_histories$clinvar_snp)
knitr::kable(extract_from_esummary(snp_summ, c("chr", "fxn_class", "global_maf")))

## ---- set_key-----------------------------------------------------------------------
set_entrez_key("ABCD123")
Sys.getenv("ENTREZ_KEY")

rentrez/inst/doc/rentrez_tutorial.html

Rentrez Tutorial

David Winter

2018-02-12

Introduction: The NCBI, entrez and rentrez.

The NCBI shares a lot of data. At the time this document was compiled, there were 28.1 million papers in PubMed, including 4.8 million full-text records available in PubMed Central. The NCBI Nucleotide Database (which includes GenBank) has data for 254.8 million different sequences, and dbSNP describes 1070.2 million different genetic variants. All of these records can be cross-referenced with the 1.32 million species in the NCBI taxonomy or 25.7 thousand disease-associated records in OMIM.

The NCBI makes this data available through a web interface, an FTP server and through a REST API called the Entrez Utilities (Eutils for short). This package provides functions to use that API, allowing users to gather and combine data from multiple NCBI databases in the comfort of an R session or script.

Getting started with rentrez

To make the most of all the data the NCBI shares you need to know a little about their databases, the records they contain and the ways you can find those records. The NCBI provides extensive documentation for each of their databases and for the EUtils API that rentrez takes advantage of. There are also some helper functions in rentrez that help users learn their way around the NCBI’s databases.

First, you can use entrez_dbs() to find the list of available databases:

entrez_dbs()
##  [1] "pubmed"          "protein"         "nuccore"        
##  [4] "ipg"             "nucleotide"      "nucgss"         
##  [7] "nucest"          "structure"       "sparcle"        
## [10] "genome"          "annotinfo"       "assembly"       
## [13] "bioproject"      "biosample"       "blastdbinfo"    
## [16] "books"           "cdd"             "clinvar"        
## [19] "clone"           "gap"             "gapplus"        
## [22] "grasp"           "dbvar"           "gene"           
## [25] "gds"             "geoprofiles"     "homologene"     
## [28] "medgen"          "mesh"            "ncbisearch"     
## [31] "nlmcatalog"      "omim"            "orgtrack"       
## [34] "pmc"             "popset"          "probe"          
## [37] "proteinclusters" "pcassay"         "biosystems"     
## [40] "pccompound"      "pcsubstance"     "pubmedhealth"   
## [43] "seqannot"        "snp"             "sra"            
## [46] "taxonomy"        "biocollections"  "unigene"        
## [49] "gencoll"         "gtr"

There is a set of functions with names starting entrez_db_ that can be used to gather more information about each of these databases:

Functions that help you learn about NCBI databases

Function name             Return
entrez_db_summary()       Brief description of what the database is
entrez_db_searchable()    Set of search terms that can be used with this database
entrez_db_links()         Set of databases that might contain linked records

For instance, we can get a description of the somewhat cryptically named database ‘cdd’…

entrez_db_summary("cdd")
##  DbName: cdd
##  MenuName: Conserved Domains
##  Description: Conserved Domain Database
##  DbBuild: Build170330-1240.1
##  Count: 56066
##  LastUpdate: 2017/03/31 15:56

… or find out which search terms can be used with the Sequence Read Archive (SRA) database (which contains raw data from sequencing projects):

entrez_db_searchable("sra")
## Searchable fields for database 'sra'
##   ALL     All terms from all searchable fields 
##   UID     Unique number assigned to publication 
##   FILT    Limits the records 
##   ACCN    Accession number of sequence 
##   TITL    Words in definition line 
##   PROP    Classification by source qualifiers and molecule type 
##   WORD    Free text associated with record 
##   ORGN    Scientific and common names of organism, and all higher levels of taxonomy 
##   AUTH    Author(s) of publication 
##   PDAT    Date sequence added to GenBank 
##   MDAT    Date of last update 
##   GPRJ    BioProject 
##   BSPL    BioSample 
##   PLAT    Platform 
##   STRA    Strategy 
##   SRC     Source 
##   SEL     Selection 
##   LAY     Layout 
##   RLEN    Percent of aligned reads 
##   ACS     Access is public or controlled 
##   ALN     Percent of aligned reads 
##   MBS     Size in megabases
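The third helper, entrez_db_links(), works in the same way, listing the other NCBI databases to which records from a given database can be linked. A quick sketch (output not shown here; the exact set of linked databases depends on the NCBI's current offerings):

entrez_db_links("gene")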

Just how these ‘helper’ functions might be useful will become clearer once you’ve started using rentrez, so let’s get started.

Getting summary data: entrez_summary()

Having found the unique IDs for some records via entrez_search() or entrez_link(), you are probably going to want to learn something about them. The Eutils API has two ways to get information about a record. entrez_fetch() returns ‘full’ records in varying formats and entrez_summary() returns less information about each record, but in a relatively simple format. Very often the summary records have the information you are after, so rentrez provides functions to parse and summarise summary records.

The summary record

entrez_summary() takes a vector of unique IDs for the records you want to get summary information from. Let’s start by finding out something about the paper describing Taxize, using its PubMed ID:

taxize_summ <- entrez_summary(db="pubmed", id=24555091)
taxize_summ
## esummary result with 42 items:
##  [1] uid               pubdate           epubdate         
##  [4] source            authors           lastauthor       
##  [7] title             sorttitle         volume           
## [10] issue             pages             lang             
## [13] nlmuniqueid       issn              essn             
## [16] pubtype           recordstatus      pubstatus        
## [19] articleids        history           references       
## [22] attributes        pmcrefcount       fulljournalname  
## [25] elocationid       doctype           srccontriblist   
## [28] booktitle         medium            edition          
## [31] publisherlocation publishername     srcdate          
## [34] reportnumber      availablefromurl  locationlabel    
## [37] doccontriblist    docdate           bookname         
## [40] chapter           sortpubdate       sortfirstauthor

Once again, the object returned by entrez_summary behaves like a list, so you can extract elements using $. For instance, we could convert our PubMed ID to another article identifier…

taxize_summ$articleids
##       idtype idtypen                           value
## 1     pubmed       1                        24555091
## 2        doi       3 10.12688/f1000research.2-191.v2
## 3        pmc       8                      PMC3901538
## 4        rid       8                        24563765
## 5        eid       8                        24555091
## 6    version       8                               2
## 7 version-id       8                               2
## 8      pmcid       5             pmc-id: PMC3901538;

…or see how many times the article has been cited in PubMed Central papers

taxize_summ$pmcrefcount
## [1] 13

Dealing with many records

If you give entrez_summary() a vector with more than one ID you’ll get a list of summary records back. Let’s get those Plasmodium vivax papers we found in the entrez_search() section back, and fetch some summary data on each paper:

vivax_search <- entrez_search(db = "pubmed",
                              term = "(vivax malaria[MeSH]) AND (folic acid antagonists[MeSH])")
multi_summs <- entrez_summary(db="pubmed", id=vivax_search$ids)

rentrez provides a helper function, extract_from_esummary(), that takes one or more elements from every summary record in one of these lists. Here it is working with one…

extract_from_esummary(multi_summs, "fulljournalname")
##                                                                                                                 29016333 
##                                                                  "The American journal of tropical medicine and hygiene" 
##                                                                                                                 28298235 
##                                                                                                        "Malaria journal" 
##                                                                                                                 24861816 
## "Infection, genetics and evolution : journal of molecular epidemiology and evolutionary genetics in infectious diseases" 
##                                                                                                                 24145518 
##                                                                                  "Antimicrobial agents and chemotherapy" 
##                                                                                                                 24007534 
##                                                                                                        "Malaria journal" 
##                                                                                                                 23230341 
##                                                                                     "The Korean journal of parasitology" 
##                                                                                                                 23043980 
##                                                                                              "Experimental parasitology" 
##                                                                                                                 20810806 
##                                                                  "The American journal of tropical medicine and hygiene" 
##                                                                                                                 20412783 
##                                                                                                           "Acta tropica" 
##                                                                                                                 19597012 
##                                                                                          "Clinical microbiology reviews" 
##                                                                                                                 17556611 
##                                                                  "The American journal of tropical medicine and hygiene" 
##                                                                                                                 17519409 
##                                                                                                                   "JAMA" 
##                                                                                                                 17368986 
##                                                                                                 "Trends in parasitology" 
##                                                                                                                 12374849 
##                                        "Proceedings of the National Academy of Sciences of the United States of America"

… and several elements:

date_and_cite <- extract_from_esummary(multi_summs, c("pubdate", "pmcrefcount",  "title"))
knitr::kable(head(t(date_and_cite)), row.names=FALSE)
pubdate pmcrefcount title
2017 Dec Distribution of Mutations Associated with Antifolate and Chloroquine Resistance among Imported <i>Plasmodium vivax</i> in the State of Qatar.
2017 Mar 16 2 Clinical and molecular surveillance of drug resistant vivax malaria in Myanmar (2009-2016).
2014 Aug Prevalence of mutations in the antifolates resistance-associated genes (dhfr and dhps) in Plasmodium vivax parasites from Eastern and Central Sudan.
2014 5 Prevalence of polymorphisms in antifolate drug resistance molecular marker genes pvdhfr and pvdhps in clinical isolates of Plasmodium vivax from Kolkata, India.
2013 Sep 5 3 Prevalence and patterns of antifolate and chloroquine drug resistance markers in Plasmodium vivax across Pakistan.
2012 Dec 13 Prevalence of drug resistance-associated gene mutations in Plasmodium vivax in Central China.

Fetching full records: entrez_fetch()

As useful as the summary records are, sometimes they just don’t have the information that you need. If you want a complete representation of a record you can use entrez_fetch, using the argument rettype to specify the format you’d like the record in.

Fetch DNA sequences in fasta format

Let’s extend the example given in the entrez_link() section about finding transcripts for a given gene. This time we will fetch cDNA sequences of those transcripts. We can start by repeating the steps in the earlier example to get nucleotide IDs for refseq transcripts of two genes:

gene_ids <- c(351, 11647)
linked_seq_ids <- entrez_link(dbfrom="gene", id=gene_ids, db="nuccore")
linked_transripts <- linked_seq_ids$links$gene_nuccore_refseqrna
head(linked_transripts)
## [1] "1039766414" "1039766413" "1039766411" "1039766410" "1039766409"
## [6] "563317856"

Now we can get our sequences with entrez_fetch, setting rettype to “fasta” (the list of formats available for each database is given in this table):

all_recs <- entrez_fetch(db="nuccore", id=linked_transripts, rettype="fasta")
class(all_recs)
## [1] "character"
nchar(all_recs)
## [1] 55183

Congratulations, now you have a really huge character vector! Rather than printing all those thousands of bases we can take a peek at the top of the file:

cat(strwrap(substr(all_recs, 1, 500)), sep="\n")
## >XM_006538500.2 PREDICTED: Mus musculus alkaline phosphatase,
## liver/bone/kidney (Alpl), transcript variant X5, mRNA
## GCGCCCGTGGCTTGCGCGACTCCCACGCGCGCGCTCCGCCGGTCCCGCAGTGACTGTCCCAGCCACGGTG
## GGGACACGTGGAAGGTCAGGCTCCCTGGGGACCCACGACCTCCCGCTCCGGACTCCGCGCGCATCTCTTG
## TGGCCTGGCAGGATGATGGACGTGGCGCCCGCTGAGCCGCTACCCAGGACCTCACCCTCGTGCTAAGCAC
## CTGCTCCCGGTGCCCACGCGCCTCCGTAGTCCACAGCTGCGCCCTTCGTGGTCCCTTGGCACTCTGTCCC
## GTTGGTGTCTAAAGTAGTTGGGGAGCAGCAGGAAGAAGGCACGTGCTGCGATCTTTGGCGGGAGAGATCG
## GAGACCGCGTGCTAGTGTCTGTCTGAGAG

If we wanted to use these sequences in some other application we could write them to file:

write(all_recs, file="my_transcripts.fasta")

Alternatively, if you want to use them within an R session
we could write them to a temporary file then read that. In this case I’m using read.dna() from the phylogenetics package ape (but not executing the code block in this vignette, so you don’t have to install that package):

temp <- tempfile()
write(all_recs, temp)
parsed_recs <- ape::read.dna(temp, format="fasta")

Fetch a parsed XML document

Most of the NCBI’s databases can return records in XML format. In addition to downloading the text representation of these files, entrez_fetch() can return objects parsed by the XML package. As an example, we can check out the Taxonomy database’s record for (did I mention they are amazing….) Tetrahymena thermophila, specifying we want the result to be parsed by setting parsed=TRUE:

Tt <- entrez_search(db="taxonomy", term="(Tetrahymena thermophila[ORGN]) AND Species[RANK]")
tax_rec <- entrez_fetch(db="taxonomy", id=Tt$ids, rettype="xml", parsed=TRUE)
class(tax_rec)
## [1] "XMLInternalDocument" "XMLAbstractDocument"

The package XML (which you have if you have installed rentrez) provides functions to get information from these files. For relatively simple records like this one you can use XML::xmlToList:

tax_list <- XML::xmlToList(tax_rec)
tax_list$Taxon$GeneticCode
## $GCId
## [1] "6"
## 
## $GCName
## [1] "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear"

For more complex records, which generate deeply-nested lists, you can use XPath expressions along with the function XML::xpathSApply or the extraction operators [ and [[ to extract specific parts of the file. For instance, we can get the scientific name of each taxon in T. thermophila’s lineage by specifying a path through the XML:

tt_lineage <- tax_rec["//LineageEx/Taxon/ScientificName"]
tt_lineage[1:4]
## [[1]]
## <ScientificName>cellular organisms</ScientificName> 
## 
## [[2]]
## <ScientificName>Eukaryota</ScientificName> 
## 
## [[3]]
## <ScientificName>Alveolata</ScientificName> 
## 
## [[4]]
## <ScientificName>Ciliophora</ScientificName>

As the name suggests, XML::xpathSApply() is a counterpart of base R’s sapply, and can be used to apply a function to nodes in an XML object. A particularly useful function to apply is XML::xmlValue, which returns the content of the node:

XML::xpathSApply(tax_rec, "//LineageEx/Taxon/ScientificName", XML::xmlValue)
##  [1] "cellular organisms" "Eukaryota"          "Alveolata"         
##  [4] "Ciliophora"         "Intramacronucleata" "Oligohymenophorea" 
##  [7] "Hymenostomatida"    "Tetrahymenina"      "Tetrahymenidae"    
## [10] "Tetrahymena"
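As a further sketch (assuming the same tax_rec object as above, and that each LineageEx/Taxon node carries both a TaxId and a ScientificName child, as it did when this was written), two such queries can be combined into a small data frame:

lineage <- data.frame(
    tax_id = XML::xpathSApply(tax_rec, "//LineageEx/Taxon/TaxId", XML::xmlValue),
    name   = XML::xpathSApply(tax_rec, "//LineageEx/Taxon/ScientificName", XML::xmlValue)
)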

There are a few more complex examples of using XPath on the rentrez wiki.

Using NCBI’s Web History features

When you are dealing with very large queries it can be time consuming to pass long vectors of unique IDs to and from the NCBI. To avoid this problem, the NCBI provides a feature called “web history” which allows users to store IDs on the NCBI servers then refer to them in future calls.

Post a set of IDs to the NCBI for later use: entrez_post()

If you have a list of many NCBI IDs that you want to use later on, you can post them to the NCBI’s servers. In order to provide a brief example, I’m going to post just one ID, the omim identifier for asthma:

upload <- entrez_post(db="omim", id=600807)
upload
## Web history object (QueryKey = 1, WebEnv = NCID_1_27537...)

The NCBI sends you back some information you can use to refer to the posted IDs. In rentrez, that information is represented as a web_history object.
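Because a web_history object behaves like a list, you can inspect the two values the NCBI returned (the query key and the web environment string) directly; the values in your session will differ:

upload$QueryKey
upload$WebEnv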

Note that if you have a very long list of IDs you may receive a 414 error when you try to upload them. If you have such a list (and the IDs come from an external source rather than a search that can be saved to a web_history object), you may have to ‘chunk’ the IDs into smaller sets that can be processed, as in the sketch below.
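Here is a minimal sketch of that chunking approach (not executed in this document; big_id_set stands in for a hypothetical vector of IDs from an external source, and the chunk size of 200 is arbitrary):

id_chunks <- split(big_id_set, ceiling(seq_along(big_id_set)/200))
for (chunk in id_chunks) {
    upload <- entrez_post(db="omim", id=chunk)
    ## ... fetch or summarise each batch via its web_history before moving on ...
}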

Use a web_history object

Once you have those IDs stored on the NCBI’s servers, you are going to want to do something with them. The functions entrez_fetch(), entrez_summary() and entrez_link() can all use web_history objects in exactly the same way they use IDs.

So, we could repeat the last example (finding variants linked to asthma), but this time using the ID we uploaded earlier

asthma_variants <- entrez_link(dbfrom="omim", db="clinvar", cmd="neighbor_history", web_history=upload)
asthma_variants
## elink object with contents:
##  $web_histories: Objects containing web history information

… if we want to get some genetic information about these variants we need to map our clinvar IDs to SNP IDs:

snp_links <- entrez_link(dbfrom="clinvar", db="snp", 
                         web_history=asthma_variants$web_histories$omim_clinvar,
                         cmd="neighbor_history")
snp_summ <- entrez_summary(db="snp", web_history=snp_links$web_histories$clinvar_snp)
knitr::kable(extract_from_esummary(snp_summ, c("chr", "fxn_class", "global_maf")))
             41364547                             11558538                                                                       20541
chr          11                                   2                                                                              5
fxn_class    intron-variant,utr-variant-5-prime   intron-variant,missense,nc-transcript-variant,reference,utr-variant-5-prime   missense,reference
global_maf   A=0.0036/18                          T=0.0595/298                                                                   A=0.2700/1352

If you really wanted to you could also use web_history objects to download all those thousands of COI sequences. When downloading large sets of data, it is a good idea to take advantage of the arguments retmax and retstart to split the request up into smaller chunks. For instance, we could get the first 200 sequences in 50-sequence chunks:

(note: this code block is not executed as part of the vignette to save time and bandwidth):

for( seq_start in seq(1,200,50)){
    recs <- entrez_fetch(db="nuccore", web_history=snail_coi$web_history,
                         rettype="fasta", retmax=50, retstart=seq_start)
    cat(recs, file="snail_coi.fasta", append=TRUE)
    cat(seq_start+49, "sequences downloaded\r")
}

Using API Keys

By default, the NCBI limits users to making only 3 requests per second (and rentrez enforces that limit). Users who register for an "API key" are able to make up to ten requests per second. Getting one of these keys is simple: you just need to register for a "my ncbi" account then click on a button in the account settings page.

Once you have an API key, rentrez will allow you to take advantage of it. For one-off cases, this is as simple as adding the api_key argument to a given function call. (Note these examples are not executed as part of this document, as the API key used here is not a real one.)

entrez_link(db="protein", dbfrom="gene", id=93100, api_key ="ABCD123")

In most cases you will want to use your API key for each of several calls to the NCBI. rentrez makes this easy by allowing you to set an environment variable, ENTREZ_KEY. Once this value is set to your key, rentrez will use it for all requests to the NCBI. To set the value for a single R session you can use the function set_entrez_key(). Here we set the value and confirm it is available.

set_entrez_key("ABCD123")
Sys.getenv("ENTREZ_KEY")
## [1] "ABCD123"

If you use rentrez often you should edit your .Renviron file (see ?Startup for a description of this file) to include your key. Doing so will mean all requests you send will take advantage of your API key.

ENTREZ_KEY=ABCD123
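If you would rather add that line from within R, one hedged option is to append it yourself (this sketch assumes your .Renviron lives in your home directory; adjust the path if yours does not):

cat("ENTREZ_KEY=ABCD123\n", file=file.path(Sys.getenv("HOME"), ".Renviron"), append=TRUE)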

As long as an API key is set by one of these methods, rentrez will allow you to make up to ten requests per second.

What next?

This tutorial has introduced you to the core functions of rentrez; there are almost limitless ways that you could put them together. Check out the wiki for more specific examples, and be sure to read the inline documentation for each function. If you run into problems with rentrez, or just need help with the package and Eutils, please contact us by opening an issue at the github repository.

rentrez/inst/doc/rentrez_tutorial.Rmd

---
title: Rentrez Tutorial
author: "David Winter"
date: "`r Sys.Date()`"
output:
  rmarkdown::html_vignette:
    toc: true
vignette: >
  %\VignetteIndexEntry{Rentrez Tutorial}
  %\VignetteEngine{knitr::rmarkdown}
  %\usepackage[utf8]{inputenc}
---

```{r, count_recs, echo=FALSE}
library(rentrez)
count_recs <- function(db, denom) {
    nrecs <- rentrez::entrez_db_summary(db)["Count"]
    round(as.integer(nrecs)/denom, 1)
}
```

## Introduction: The NCBI, entrez and `rentrez`.

The NCBI shares a _lot_ of data. At the time this document was compiled, there were `r count_recs("pubmed",1e6)` million papers in [PubMed](http://www.ncbi.nlm.nih.gov/pubmed/), including `r count_recs("pmc", 1e6)` million full-text records available in [PubMed Central](http://www.ncbi.nlm.nih.gov/pubmed/). [The NCBI Nucleotide Database](http://www.ncbi.nlm.nih.gov/nuccore) (which includes GenBank) has data for `r count_recs("nuccore", 1e6)` million different sequences, and [dbSNP](http://www.ncbi.nlm.nih.gov/snp/) describes `r count_recs("snp", 1e6)` million different genetic variants. All of these records can be cross-referenced with the `r round(entrez_search(db="taxonomy", term='species[RANK]')$count/1e6,2)` million species in the [NCBI taxonomy](www.ncbi.nlm.nih.gov/taxonomy) or `r count_recs("omim", 1e3)` thousand disease-associated records in [OMIM](http://www.ncbi.nlm.nih.gov/omim).

The NCBI makes this data available through a [web interface](http://www.ncbi.nlm.nih.gov/), an [FTP server](ftp://ftp.ncbi.nlm.nih.gov/) and through a REST API called the [Entrez Utilities](http://www.ncbi.nlm.nih.gov/books/NBK25500/) (`Eutils` for short). This package provides functions to use that API, allowing users to gather and combine data from multiple NCBI databases in the comfort of an R session or script.

## Getting started with rentrez

To make the most of all the data the NCBI shares you need to know a little about their databases, the records they contain and the ways you can find those records. The [NCBI provides extensive documentation for each of their databases](http://www.ncbi.nlm.nih.gov/home/documentation.shtml) and for the [EUtils API that `rentrez` takes advantage of](http://www.ncbi.nlm.nih.gov/books/NBK25501/). There are also some helper functions in `rentrez` that help users learn their way around the NCBI's databases.

First, you can use `entrez_dbs()` to find the list of available databases:

```{r, dbs}
entrez_dbs()
```

There is a set of functions with names starting `entrez_db_` that can be used to gather more information about each of these databases:

**Functions that help you learn about NCBI databases**

| Function name            | Return                                                   |
|--------------------------|----------------------------------------------------------|
| `entrez_db_summary()`    | Brief description of what the database is                |
| `entrez_db_searchable()` | Set of search terms that can be used with this database  |
| `entrez_db_links()`      | Set of databases that might contain linked records       |

For instance, we can get a description of the somewhat cryptically named database 'cdd'...

```{r, cdd}
entrez_db_summary("cdd")
```

... or find out which search terms can be used with the Sequence Read Archive (SRA) database (which contains raw data from sequencing projects):

```{r, sra_eg}
entrez_db_searchable("sra")
```

Just how these 'helper' functions might be useful will become clearer once you've started using `rentrez`, so let's get started.
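One more helper worth knowing before we do: `entrez_db_links()` lists the other NCBI databases to which records in a given database can be linked. A quick sketch (not executed here; the exact set of linked databases depends on the NCBI's current offerings):

```r
entrez_db_links("gene")
```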
## Searching databases: `entrez_search()`

Very often, the first thing you'll want to do with `rentrez` is search a given NCBI database to find records that match some keywords. You can do this using the function `entrez_search()`. In the simplest case you just need to provide a database name (`db`) and a search term (`term`) so let's search PubMed for articles about the `R language`:

```{r eg_search}
r_search <- entrez_search(db="pubmed", term="R Language")
```

The object returned by a search acts like a list, and you can get a summary of its contents by printing it.

```{r print_search}
r_search
```

There are a few things to note here. First, the NCBI's server has worked out that we meant R as a programming language, and so included the ['MeSH' term](http://www.ncbi.nlm.nih.gov/mesh) associated with programming languages. We'll worry about MeSH terms and other special queries later, for now just note that you can use this feature to check that your search term was interpreted in the way you intended. Second, there are many more 'hits' for this search than there are unique IDs contained in this object. That's because the optional argument `retmax`, which controls the maximum number of returned values, has a default value of 20.

The IDs are the most important thing returned here. They allow us to fetch records matching those IDs, gather summary data about them or find cross-referenced records in other databases. We access the IDs as a vector using the `$` operator:

```{r search_ids}
r_search$ids
```

If we want to get more than 20 IDs we can do so by increasing the `retmax` argument.

```{r searchids_2}
another_r_search <- entrez_search(db="pubmed", term="R Language", retmax=40)
another_r_search
```

If we want to get IDs for all of the thousands of records that match this search, we can use the NCBI's web history feature [described below](#web_history).

### Building search terms

The EUtils API uses a special syntax to build search terms. You can search a database against a specific term using the format `query[SEARCH FIELD]`, and combine multiple such searches using the boolean operators `AND`, `OR` and `NOT`.

For instance, we can find next generation sequence datasets for the (amazing...) ciliate _Tetrahymena thermophila_ by using the organism ('ORGN') search field:

```{r, Tt}
entrez_search(db="sra",
              term="Tetrahymena thermophila[ORGN]",
              retmax=0)
```

We can narrow our focus to only those records that have been added recently (using the colon to specify a range of values):

```{r, Tt2}
entrez_search(db="sra",
              term="Tetrahymena thermophila[ORGN] AND 2013:2015[PDAT]",
              retmax=0)
```

Or include recent records for either _T. thermophila_ or its close relative _T. borealis_ (using parentheses to make ANDs and ORs explicit).

```{r, Tt3}
entrez_search(db="sra",
              term="(Tetrahymena thermophila[ORGN] OR Tetrahymena borealis[ORGN]) AND 2013:2015[PDAT]",
              retmax=0)
```

The set of search terms available varies between databases. You can get a list of available terms for any given database with `entrez_db_searchable()`

```{r, sra_searchable}
entrez_db_searchable("sra")
```

### Using the Filter field

"Filter" is a special field that, as the name suggests, allows you to limit records returned by a search to a set of filtering criteria. There is no programmatic way to find the particular terms that can be used with the Filter field. However, the NCBI's website provides an "advanced search" tool for some databases that can be used to discover these terms.
For example, you can find all of the terms that can be used to filter searches of the nucleotide database using the [advanced search page for that database](https://www.ncbi.nlm.nih.gov/nuccore/advanced). On that page, selecting "Filter" from the first drop-down box then clicking "Show index list" will allow the user to scroll through possible filtering terms.

### Precise queries using MeSH terms

In addition to the search terms described above, the NCBI allows searches using [Medical Subject Heading (MeSH)](http://www.ncbi.nlm.nih.gov/mesh) terms. These terms create a 'controlled vocabulary', and allow users to make very finely controlled queries of databases.

For instance, if you were interested in reviewing studies on how a class of anti-malarial drugs called Folic Acid Antagonists work against _Plasmodium vivax_ (a particular species of malarial parasite), you could use this search:

```{r, mesh}
entrez_search(db   = "pubmed",
              term = "(vivax malaria[MeSH]) AND (folic acid antagonists[MeSH])")
```

The complete set of MeSH terms is available as a database from the NCBI. That means it is possible to download detailed information about each term and find the ways in which terms relate to each other using `rentrez`. You can search for specific terms with `entrez_search(db="mesh", term =...)` and learn about the results of your search using the tools described below.

### Advanced counting

As you can see above, the object returned by `entrez_search()` includes the number of records matching a given search. This means you can learn a little about the composition of, or trends in, the records stored in the NCBI's databases using only the search utility. For instance, let's track the rise of the scientific buzzword "connectome" in PubMed, programmatically creating search terms for the `PDAT` field:

```{r, connectome, fig.width=5, fig.height=4, fig.align='center'}
search_year <- function(year, term){
    query <- paste(term, "AND (", year, "[PDAT])")
    entrez_search(db="pubmed", term=query, retmax=0)$count
}

year <- 2008:2014
papers <- sapply(year, search_year, term="Connectome", USE.NAMES=FALSE)

plot(year, papers, type='b', main="The Rise of the Connectome")
```

## Finding cross-references: `entrez_link()`

One of the strengths of the NCBI databases is the degree to which records of one type are connected to other records within the NCBI or to external data sources. The function `entrez_link()` allows users to discover these links between records.

### My god, it's full of links

To get an idea of the degree to which records in the NCBI are cross-linked we can find all NCBI data associated with a single gene (in this case the Amyloid Beta Precursor gene, the product of which is associated with the plaques that form in the brains of Alzheimer's Disease patients).

The function `entrez_link()` can be used to find cross-referenced records. In the most basic case we need to provide an ID (`id`), the database from which this ID comes (`dbfrom`) and the name of a database in which to find linked records (`db`). If we set this last argument to 'all' we can find links in multiple databases:

```{r elink0}
all_the_links <- entrez_link(dbfrom='gene', id=351, db='all')
all_the_links
```

Just as with `entrez_search` the returned object behaves like a list, and we can learn a little about its contents by printing it.
In this case, all of the information is in `links` (and there's a lot of them!):

```{r elink_link}
all_the_links$links
```

The names of the list elements are in the format `[source_database]_[linked_database]` and the elements themselves contain a vector of linked-IDs. So, if we want to find open access publications associated with this gene we could get linked records in PubMed Central:

```{r, elink_pmc}
all_the_links$links$gene_pmc[1:10]
```

Or if we were interested in this gene's role in diseases we could find links to clinVar:

```{r, elink_omim}
all_the_links$links$gene_clinvar
```

### Narrowing our focus

If we know beforehand what sort of links we'd like to find, we can use the `db` argument to narrow the focus of a call to `entrez_link`.

For instance, say we are interested in knowing about all of the RNA transcripts associated with the Amyloid Beta Precursor gene in humans. Transcript sequences are stored in the nucleotide database (referred to as `nuccore` in EUtils), so to find transcripts associated with a given gene we need to set `dbfrom=gene` and `db=nuccore`.

```{r, elink1}
nuc_links <- entrez_link(dbfrom='gene', id=351, db='nuccore')
nuc_links
nuc_links$links
```

The object we get back contains links to the nucleotide database generally, but also to special subsets of that database like [refseq](http://www.ncbi.nlm.nih.gov/refseq/). We can take advantage of this narrower set of links to find IDs that match unique transcripts from our gene of interest.

```{r, elinik_refseqs}
nuc_links$links$gene_nuccore_refseqrna
```

We can use these ids in calls to `entrez_fetch()` or `entrez_summary()` to learn more about the transcripts they represent.

### External links

In addition to finding data within the NCBI, `entrez_link` can turn up connections to external databases. Perhaps the most interesting example is finding links to the full text of papers in PubMed. For example, when I wrote this document the first paper linked to Amyloid Beta Precursor had a unique ID of `25500142`. We can find links to the full text of that paper with `entrez_link` by setting the `cmd` argument to 'llinks':

```{r, outlinks}
paper_links <- entrez_link(dbfrom="pubmed", id=25500142, cmd="llinks")
paper_links
```

Each element of the `linkouts` object contains information about an external source of data on this paper:

```{r, urls}
paper_links$linkouts
```

Each of those linkout objects contains quite a lot of information, but the URL is probably the most useful. For that reason, `rentrez` provides the function `linkout_urls` to make extracting just the URL simple:

```{r just_urls}
linkout_urls(paper_links)
```

The full list of options for the `cmd` argument is given in the in-line documentation (`?entrez_link`). If you are interested in finding full text records for a large number of articles check out the package [fulltext](https://github.com/ropensci/fulltext) which makes use of multiple sources (including the NCBI) to discover the full text articles.

### Using more than one ID

It is possible to pass more than one ID to `entrez_link()`. By default, doing so will give you a single elink object containing the complete set of links for _all_ of the IDs that you specified.
So, if you were looking for protein IDs related to specific genes you could do:

```{r, multi_default}
all_links_together <- entrez_link(db="protein", dbfrom="gene", id=c("93100", "223646"))
all_links_together
all_links_together$links$gene_protein
```

Although this behaviour might sometimes be useful, it means we've lost track of which `protein` ID is linked to which `gene` ID. To retain that information we can set `by_id` to `TRUE`. This gives us a list of elink objects, each one containing links from a single `gene` ID:

```{r, multi_byid}
all_links_sep <- entrez_link(db="protein", dbfrom="gene", id=c("93100", "223646"), by_id=TRUE)
all_links_sep
lapply(all_links_sep, function(x) x$links$gene_protein)
```

## Getting summary data: `entrez_summary()`

Having found the unique IDs for some records via `entrez_search()` or `entrez_link()`, you are probably going to want to learn something about them. The `Eutils` API has two ways to get information about a record. `entrez_fetch()` returns 'full' records in varying formats and `entrez_summary()` returns less information about each record, but in a relatively simple format. Very often the summary records have the information you are after, so `rentrez` provides functions to parse and summarise summary records.

### The summary record

`entrez_summary()` takes a vector of unique IDs for the records you want to get summary information from. Let's start by finding out something about the paper describing [Taxize](https://github.com/ropensci/taxize), using its PubMed ID:

```{r, Summ_1}
taxize_summ <- entrez_summary(db="pubmed", id=24555091)
taxize_summ
```

Once again, the object returned by `entrez_summary` behaves like a list, so you can extract elements using `$`. For instance, we could convert our PubMed ID to another article identifier...

```{r, Summ_2}
taxize_summ$articleids
```

...or see how many times the article has been cited in PubMed Central papers

```{r, Summ_3}
taxize_summ$pmcrefcount
```

### Dealing with many records

If you give `entrez_summary()` a vector with more than one ID you'll get a list of summary records back. Let's get those _Plasmodium vivax_ papers we found in the `entrez_search()` section back, and fetch some summary data on each paper:

```{r, multi_summ}
vivax_search <- entrez_search(db = "pubmed",
                              term = "(vivax malaria[MeSH]) AND (folic acid antagonists[MeSH])")
multi_summs <- entrez_summary(db="pubmed", id=vivax_search$ids)
```

`rentrez` provides a helper function, `extract_from_esummary()`, that takes one or more elements from every summary record in one of these lists. Here it is working with one...

```{r, multi_summ2}
extract_from_esummary(multi_summs, "fulljournalname")
```

... and several elements:

```{r, multi_summ3}
date_and_cite <- extract_from_esummary(multi_summs, c("pubdate", "pmcrefcount", "title"))
knitr::kable(head(t(date_and_cite)), row.names=FALSE)
```

## Fetching full records: `entrez_fetch()`

As useful as the summary records are, sometimes they just don't have the information that you need. If you want a complete representation of a record you can use `entrez_fetch`, using the argument `rettype` to specify the format you'd like the record in.

### Fetch DNA sequences in fasta format

Let's extend the example given in the `entrez_link()` section about finding transcripts for a given gene.
This time we will fetch cDNA sequences of those transcripts. We can start by repeating the steps in the earlier example to get nucleotide IDs for refseq transcripts of two genes:

```{r, transcript_ids}
gene_ids <- c(351, 11647)
linked_seq_ids <- entrez_link(dbfrom="gene", id=gene_ids, db="nuccore")
linked_transripts <- linked_seq_ids$links$gene_nuccore_refseqrna
head(linked_transripts)
```

Now we can get our sequences with `entrez_fetch`, setting `rettype` to "fasta" (the list of formats available for [each database is given in this table](http://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/)):

```{r fetch_fasta}
all_recs <- entrez_fetch(db="nuccore", id=linked_transripts, rettype="fasta")
class(all_recs)
nchar(all_recs)
```

Congratulations, now you have a really huge character vector! Rather than printing all those thousands of bases we can take a peek at the top of the file:

```{r, peak}
cat(strwrap(substr(all_recs, 1, 500)), sep="\n")
```

If we wanted to use these sequences in some other application we could write them to file:

```r
write(all_recs, file="my_transcripts.fasta")
```

Alternatively, if you want to use them within an R session we could write them to a temporary file then read that. In this case I'm using `read.dna()` from the phylogenetics package ape (but not executing the code block in this vignette, so you don't have to install that package):

```r
temp <- tempfile()
write(all_recs, temp)
parsed_recs <- ape::read.dna(temp, format="fasta")
```

### Fetch a parsed XML document

Most of the NCBI's databases can return records in XML format. In addition to downloading the text-representation of these files, `entrez_fetch()` can return objects parsed by the `XML` package. As an example, we can check out the Taxonomy database's record for (did I mention they are amazing....) _Tetrahymena thermophila_, specifying we want the result to be parsed by setting `parsed=TRUE`:

```{r, Tt_tax}
Tt <- entrez_search(db="taxonomy", term="(Tetrahymena thermophila[ORGN]) AND Species[RANK]")
tax_rec <- entrez_fetch(db="taxonomy", id=Tt$ids, rettype="xml", parsed=TRUE)
class(tax_rec)
```

The package XML (which you have if you have installed `rentrez`) provides functions to get information from these files. For relatively simple records like this one you can use `XML::xmlToList`:

```{r, Tt_list}
tax_list <- XML::xmlToList(tax_rec)
tax_list$Taxon$GeneticCode
```

For more complex records, which generate deeply-nested lists, you can use [XPath expressions](https://en.wikipedia.org/wiki/XPath) along with the function `XML::xpathSApply` or the extraction operators `[` and `[[` to extract specific parts of the file. For instance, we can get the scientific name of each taxon in _T. thermophila_'s lineage by specifying a path through the XML:

```{r, Tt_path}
tt_lineage <- tax_rec["//LineageEx/Taxon/ScientificName"]
tt_lineage[1:4]
```

As the name suggests, `XML::xpathSApply()` is a counterpart of base R's `sapply`, and can be used to apply a function to nodes in an XML object. A particularly useful function to apply is `XML::xmlValue`, which returns the content of the node:

```{r, Tt_apply}
XML::xpathSApply(tax_rec, "//LineageEx/Taxon/ScientificName", XML::xmlValue)
```

There are a few more complex examples of using `XPath` [on the rentrez wiki](https://github.com/ropensci/rentrez/wiki).

## Using NCBI's Web History features

When you are dealing with very large queries it can be time consuming to pass long vectors of unique IDs to and from the NCBI.
To avoid this problem, the NCBI provides a feature called "web history" which allows users to store IDs on the NCBI servers then refer to them in future calls.

### Post a set of IDs to the NCBI for later use: `entrez_post()`

If you have a list of many NCBI IDs that you want to use later on, you can post them to the NCBI's servers. In order to provide a brief example, I'm going to post just one ID, the `omim` identifier for asthma:

```{r, asthma}
upload <- entrez_post(db="omim", id=600807)
upload
```

The NCBI sends you back some information you can use to refer to the posted IDs. In `rentrez`, that information is represented as a `web_history` object.

Note that if you have a very long list of IDs you may receive a 414 error when you try to upload them. If you have such a list (and the IDs come from an external source rather than a search that can be saved to a `web_history` object), you may have to 'chunk' the IDs into smaller sets that can be processed.

### Get a `web_history` object from `entrez_search` or `entrez_link()`

In addition to directly uploading IDs to the NCBI, you can use the web history features with `entrez_search` and `entrez_link`. For instance, imagine you wanted to find all of the sequences of the widely-studied gene COI from all snails (which are members of the taxonomic group Gastropoda):

```{r, snail_search}
entrez_search(db="nuccore", term="COI[Gene] AND Gastropoda[ORGN]")
```

That's a lot of sequences! If you really wanted to download all of these it would be a good idea to save all those IDs to the server by setting `use_history` to `TRUE` (note you now get a `web_history` object along with your normal search result):

```{r, snail_history}
snail_coi <- entrez_search(db="nuccore",
                           term="COI[Gene] AND Gastropoda[ORGN]",
                           use_history=TRUE)
snail_coi
snail_coi$web_history
```

Similarly, `entrez_link()` can return `web_history` objects by using the `cmd` `neighbor_history`. Let's find genetic variants (from the clinvar database) associated with asthma (using the same OMIM ID we identified earlier):

```{r, asthma_links}
asthma_clinvar <- entrez_link(dbfrom="omim", db="clinvar", cmd="neighbor_history", id=600807)
asthma_clinvar$web_histories
```

As you can see, instead of returning lists of IDs for each linked database (as it would by default), `entrez_link()` now returns a list of web_histories.

### Use a `web_history` object

Once you have those IDs stored on the NCBI's servers, you are going to want to do something with them. The functions `entrez_fetch()`, `entrez_summary()` and `entrez_link()` can all use `web_history` objects in exactly the same way they use IDs.

So, we could repeat the last example (finding variants linked to asthma), but this time using the ID we uploaded earlier

```{r, asthma_links_upload}
asthma_variants <- entrez_link(dbfrom="omim", db="clinvar",
                               cmd="neighbor_history",
                               web_history=upload)
asthma_variants
```

... if we want to get some genetic information about these variants we need to map our clinvar IDs to SNP IDs:

```{r, links}
snp_links <- entrez_link(dbfrom="clinvar", db="snp",
                         web_history=asthma_variants$web_histories$omim_clinvar,
                         cmd="neighbor_history")
snp_summ <- entrez_summary(db="snp", web_history=snp_links$web_histories$clinvar_snp)
knitr::kable(extract_from_esummary(snp_summ, c("chr", "fxn_class", "global_maf")))
```

If you really wanted to you could also use `web_history` objects to download all those thousands of COI sequences.
When downloading large sets of data, it is a good idea to take advantage of the arguments `retmax` and `retstart` to split the request up into smaller chunks. For instance, we could get the first 200 sequences in 50-sequence chunks:

(note: this code block is not executed as part of the vignette to save time and bandwidth):

```r
for( seq_start in seq(1,200,50)){
    recs <- entrez_fetch(db="nuccore", web_history=snail_coi$web_history,
                         rettype="fasta", retmax=50, retstart=seq_start)
    cat(recs, file="snail_coi.fasta", append=TRUE)
    cat(seq_start+49, "sequences downloaded\r")
}
```

## Using API Keys

By default, the NCBI limits users to making only 3 requests per second (and `rentrez` enforces that limit). Users who register for an "API key" are able to make up to ten requests per second. Getting one of these keys is simple: you just need to [register for a "my ncbi" account](https://www.ncbi.nlm.nih.gov/account/) then click on a button in the [account settings page](https://www.ncbi.nlm.nih.gov/account/settings/).

Once you have an API key, rentrez will allow you to take advantage of it. For one-off cases, this is as simple as adding the `api_key` argument to a given function call. (Note these examples are not executed as part of this document, as the API key used here is not a real one.)

```r
entrez_link(db="protein", dbfrom="gene", id=93100, api_key ="ABCD123")
```

In most cases you will want to use your API key for each of several calls to the NCBI. `rentrez` makes this easy by allowing you to set an environment variable, `ENTREZ_KEY`. Once this value is set to your key, `rentrez` will use it for all requests to the NCBI. To set the value for a single R session you can use the function `set_entrez_key()`. Here we set the value and confirm it is available.

```{r, set_key}
set_entrez_key("ABCD123")
Sys.getenv("ENTREZ_KEY")
```

If you use `rentrez` often you should edit your `.Renviron` file (see `?Startup` for a description of this file) to include your key. Doing so will mean all requests you send will take advantage of your API key.

```ini
ENTREZ_KEY=ABCD123
```

As long as an API key is set by one of these methods, `rentrez` will allow you to make up to ten requests per second.

## What next?

This tutorial has introduced you to the core functions of `rentrez`; there are almost limitless ways that you could put them together. [Check out the wiki](https://github.com/ropensci/rentrez/wiki) for more specific examples, and be sure to read the inline-documentation for each function.
If you run into problems with rentrez, or just need help with the package and `Eutils`, please contact us by opening an issue at the [github repository](https://github.com/ropensci/rentrez/issues)

rentrez/tests/testthat/test_search.r

context("search")

#setup
gsearch <- entrez_global_query("Heliconius")
pubmed_search <- entrez_search(db = "pubmed",
                               term = "10.1016/j.ympev.2010.07.013[doi]")
json_search <- entrez_search(db="pubmed",
                             term = "10.1016/j.ympev.2010.07.013[doi]",
                             retmode='json')

test_that("Global query works",{
    #global query
    expect_that(gsearch, is_a("numeric"))
    expect_that(names(gsearch), is_a("character"))
    expect_true(sum(gsearch) > 0 )
})

test_that("Entrez query works",{
    #entrez query
    expect_that(pubmed_search, is_a("esearch"))
    expect_that(pubmed_search$ids, is_identical_to("20674752"))
})

test_that("Entrez query works just as well with xml/json",{
    expect_that(json_search, is_a("esearch"))
    expect_that(json_search$ids, is_identical_to("20674752"))
    expect_equal(names(pubmed_search), names(json_search))
})

test_that("we can print search results", {
    expect_output(print(pubmed_search), "Entrez search result with \\d+ hits")
    expect_output(print(json_search), "Entrez search result with \\d+ hits")
})

rentrez/tests/testthat/test_httr_post.r

context("POST (the HTTP verb)")

are_there_any_cancer_papers <- entrez_search(db="pubmed", term="Cancer", retmax=201)
search_ids <- are_there_any_cancer_papers$ids

test_that("We can POST to NCBI epost", {
    wh <- entrez_post(db="pubmed", id=search_ids)
    expect_that(wh, is_a("web_history"))
    expect_that(as.integer(wh$QueryKey), is_a("integer"))
    expect_false(is.na(as.integer(wh$QueryKey)))
})

test_that("We can fetch using POST", {
    fetched_ids <- entrez_fetch(db="pubmed", id=search_ids, rettype="uilist")
    expect(
        all( strsplit(fetched_ids, "\n")[[1]] %in% search_ids),
        "fetched IDs do not match sent IDs when using httr::POST"
    )
})

rentrez/tests/testthat/test_query.r

context("query")

test_that("Query building functions work", {
    #concatenate multiple IDs, include entrez terms
    query <- rentrez:::make_entrez_query("efetch",
                                         db="nuccore",
                                         id=c(443610374, 443610372),
                                         config=list(),
                                         retmode="txt",
                                         rettype="fasta")
    nrecs <- length(gregexpr(">", query)[[1]])
    expect_equal(nrecs, 2)

    #should be able to give ints or characters to id and get a url
    query <- rentrez:::make_entrez_query("efetch",
                                         db="nuccore",
                                         id=c("443610374", "443610372"),
                                         retmode="txt",
                                         config=list(),
                                         rettype="fasta")
    nrecs <- length(gregexpr(">", query)[[1]])
    expect_equal(nrecs, 2)

    #specific functions have the right "require one of" settings
    expect_that(entrez_fetch(db="nuccore", rettype="fasta"), throws_error())
    expect_that(entrez_summary(db="nuccore", web_history="A", id=123), throws_error())
    expect_that(entrez_link(db="nuccore", dbfrom="pubmed"), throws_error())

    #httr passes on errors
    #404
    expect_error(rentrez:::make_entrez_query("non-eutil", id=12, db="none", config=list()))
    #400
    expect_error(rentrez:::make_entrez_query("efetch", id=1e-17, config=list(), db="nuccore"))
})

test_that("We give a useful error when an empty ID vector is passed", {
    ET <- entrez_search(db="taxonomy", term="Extraterrestrial[Organism]")
expect_error(entrez_fetch(db="taxonomy", id= ET$ids, rettype="uilist")) }) rentrez/tests/testthat/test_citmatch.r0000644000176200001440000000063213017107141017757 0ustar liggesuserscontext("Cite matching") test_that("Citation matching works",{ ex_cites <- c("proc natl acad sci u s a|1991|88|3248|mann bj|test1|", "science|1987|235|182|palmenberg ac|test2|") res <- entrez_citmatch(ex_cites) expect_that(res, is_a("character")) expect_equal(res, c("2014248", "3026048")) expect_warning(entrez_citmatch(c("some|nonsense|", ex_cites))) }) rentrez/tests/testthat/test_summary.r0000644000176200001440000001044113111437373017667 0ustar liggesuserscontext("fetching and parsing summary recs") fake_ids <- sample(1e5, 501) pop_ids = c("307082412", "307075396", "307075338", "307075274") pop_summ_xml <- entrez_summary(db="popset", id=pop_ids, version="1.0") pop_summ_json <- entrez_summary(db="popset", id=pop_ids, version="2.0") pop_summ_xml2 <- entrez_summary(db="popset", id=pop_ids, version="1.0", retmode="xml") test_that("Functions to fetch summaries work", { #tests expect_that(pop_summ_xml, is_a("list")) expect_that(pop_summ_json, is_a("list")) expect_that(pop_summ_xml[[4]], is_a("esummary")) expect_that(pop_summ_json[[4]], is_a("esummary")) sapply(pop_summ_json, function(x) expect_that(x[["title"]], matches("Muraenidae")) ) }) test_that("List elements in XML are parsable", { rec <- entrez_summary(db="pubmed", id=25696867, version="1.0") expect_named(rec$History) expect_gt(length(rec$History), 0) }) test_that("Version 2 xml records can be fetched and parsed", { sapply(pop_summ_xml2, function(x) expect_that(x[["Title"]], matches("Muraenidae"))) expect_that(length(pop_summ_xml2[[1]]), is_more_than(12)) }) test_that("JSON and XML objects are similar", { #It would be nice to test whether the xml and json records # have the same data in them, but it turns out they don't # when they leave the NCBI, so let's ensure we can get some # info from each file, even if they won't be exactly the same sapply(pop_summ_xml, function(x) expect_that(x[["Title"]], matches("Muraenidae"))) sapply(pop_summ_json, function(x) expect_that(x[["title"]], matches("Muraenidae"))) expect_that(length(pop_summ_xml[[1]]), is_more_than(12)) expect_that(length(pop_summ_json[[1]]), is_more_than(12)) }) test_that("Error when trying to fetch 1.0 summaries as json", { expect_error( entrez_summary("pubmed", id = fake_ids[1:10], version="1.0", retmode="json") ) }) test_that("We can print summary records", { expect_output(print(pop_summ_json), "List of 4 esummary records") expect_output(print(pop_summ_json[[1]]), "esummary result with \\d+ items") expect_output(print(pop_summ_xml), "List of 4 esummary records") expect_output(print(pop_summ_xml[[1]]), "esummary result with \\d+ items") }) test_that("We can detect errors in esummary records", { expect_warning( entrez_summary(db="pmc", id=c(4318541212,4318541), version="1.0") ) expect_warning( entrez_summary(db="pmc", id=c(4318541212,4318541)) ) }) test_that("We can detect errors in esummary returns", { expect_error( entrez_summary(db="pmc", id=fake_ids, version="2.0") ) }) test_that("We can extract elements from esummary object", { expect_that(extract_from_esummary(pop_summ_xml, c("Title", "TaxId")), is_a("matrix")) expect_that(extract_from_esummary(pop_summ_xml, c("Title", "TaxId"), simplify=FALSE), is_a("list")) expect_that(extract_from_esummary(pop_summ_xml2, c("Title", "TaxId"), simplify=FALSE), is_a("list")) expect_that(extract_from_esummary(pop_summ_json, "title"), is_a("character")) })
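# extract_from_esummary is also defined for a single esummary record (not just # a list of records), so check that method too: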
test_that("We can extract elements from a single esummary", { expect_that(extract_from_esummary(pop_summ_xml[[1]], c("Title", "TaxId")), is_a("list")) expect_that(extract_from_esummary(pop_summ_xml[[1]], "Gi"), is_a("integer")) expect_that(extract_from_esummary(pop_summ_xml[[1]], "Gi", FALSE), is_a("list")) }) test_that("We can get a list of one element if we ask for it", { expect_that(entrez_summary(db="popset", id=307075396, always_return_list=TRUE), is_a("list")) expect_that(entrez_summary(db="popset", id=307075396), is_a("esummary")) }) test_that("We can fetch summaries on versioned sequences", { old_rec = entrez_summary(db="nuccore", id="AF123456.1") new_rec = entrez_summary(db="nuccore", id="AF123456.2") expect_match(old_rec$title, "testis-specific mRNA") expect_match(new_rec$title, "doublesex and mab-3 related transcription factor") }) rentrez/tests/testthat/test_post.r0000644000176200001440000000232713017107141017153 0ustar liggesuserscontext("entrez_post") prot_ids = c(15718680,157427902) ret <- entrez_post(id=prot_ids, db="protein") test_that("we can post ids", { qk <- ret$QueryKey expect_that(as.integer(qk), is_a("integer")) expect_false(is.na(as.integer(qk))) expect_that(ret$QueryKey, is_a("character")) }) test_that("we can add to WebEnv", { ret2 <- entrez_post(id=119703751, db="protein", web_history=ret) first <- entrez_summary(db="protein", web_history=ret) second <- entrez_summary(db="protein", web_history=ret2) expect_equal(ret2$QueryKey, "2") expect_equal(ret2$WebEnv, ret$WebEnv) expect_equal(length(first), 2) expect_that(second, is_a("esummary"))#i.e. just one }) test_that("Example works", { so_many_snails <- entrez_search(db="nuccore", "Gastropoda[Organism] AND COI[Gene]", retmax=200) upload <- entrez_post(db="nuccore", id=so_many_snails$ids) first <- entrez_fetch(db="nuccore", rettype="fasta", web_history=upload, retstart=0, retmax=4) nrecs <- length(gregexpr(">", first)[[1]]) expect_equal(nrecs, 4) }) test_that("We can print a post result", { expect_output(print(ret), "\\(QueryKey = \\d+, WebEnv = [A-Z0-9_]+\\.\\.\\.\\)") }) rentrez/tests/testthat/test_api_key.r0000644000176200001440000000155213237161662017612 0ustar liggesuserscontext("Using API keys") test_that("API keys can be passed as normal args", { payload <- make_entrez_query(util="test", config=list(), id=100, api_key="ABC", debug_mode=TRUE) expect_match(payload$args$api_key, "ABC") }) test_that("API keys can be passed from ENV vars", { set_entrez_key("ABC") payload <- make_entrez_query(util="test", config=list(), id=100, debug_mode=TRUE) expect_match(payload$args$api_key, "ABC") }) test_that("Rate limiting changes when API key set", { # with key = 10 per sec set_entrez_key("ABC") payload <- make_entrez_query(util="test", config=list(), id=100, debug_mode=TRUE) expect_equal(sleep_time(payload$args), 0.1) # No key = 3 per sec set_entrez_key("") payload <- make_entrez_query(util="test", config=list(), id=100, debug_mode=TRUE) expect_equal(sleep_time(payload$args), 1/3) }) rentrez/tests/testthat/test_docs.r0000644000176200001440000000107213017107141017112 0ustar liggesusers# test any parts of the README or tutorial that aren't already part of the test # suite.
Note, the final example of the README makes a lot of calls to the NCBI, so is # not included here context("documentation") test_that("Examples in documentation work", { #setup hox_paper <- entrez_search(db="pubmed", term="10.1038/nature08789[doi]") katipo_search <- entrez_search(db="popset", term="Latrodectus katipo[Organism]") expect_that(hox_paper$ids, equals("20203609")) expect_true(katipo_search$count >= 6) }) rentrez/tests/testthat/test_net.r0000644000176200001440000000024613017107141016752 0ustar liggesuserscontext("Network") test_that("The NCBI is contactable from this computer",{ expect_true(!httr::http_error("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")) }) rentrez/tests/testthat/test_httr.r0000644000176200001440000000072413017107141017146 0ustar liggesuserscontext("httr option passing") #most config options don't produce capture-able output, so instead # we will test if we raise an error when we use a non-existent proxy to # connect to the internet test_that("httr config options can be passed to rentrez functions",{ expect_error(entrez_search(db="popset", term="test", config=use_proxy(url="0.0.0.0", port=80 ))) }) rentrez/tests/testthat/test_webenv.r0000644000176200001440000000117013017107141017451 0ustar liggesuserscontext("WebEnv") test_that("Searches using WebEnv features work", { #setup web_env_search <- entrez_search(db="nuccore", term="Gastropoda[Organism] AND COI[Gene]", use_history=TRUE) wh <- web_env_search$web_history snail_coi <- entrez_fetch(db = "nuccore", web_history=wh, rettype = "fasta", retmax = 10) #test expect_that(wh$WebEnv, is_a("character")) expect_that(as.integer(wh$QueryKey), is_a("integer")) expect_that(snail_coi, is_a("character")) expect_that(length(strsplit(snail_coi, ">")[[1]]), equals(11)) }) rentrez/tests/testthat/test_parse.r0000644000176200001440000000315713017107141017302 0ustar liggesuserscontext("result-parsers") raw_rec <- entrez_fetch(db="pubmed", id=20674752, rettype="xml") xml_rec <- entrez_fetch(db="pubmed", id=20674752, rettype="xml", parsed=TRUE) multi_rec <- entrez_fetch(db="pubmed", id=c(22883857, 25042335, 20203609,11959827), rettype="xml", parsed=TRUE) parsed_raw <- parse_pubmed_xml(raw_rec) parsed_rec <- parse_pubmed_xml(xml_rec) parsed_multi <- parse_pubmed_xml(multi_rec) test_that("pubmed file parsers work",{ expect_that(raw_rec, is_a("character")) expect_that(parsed_raw, is_a("pubmed_record")) expect_that(parsed_rec, is_a("pubmed_record")) expect_that(names(parsed_rec), is_a("character")) expect_that(parsed_rec$pmid, is_identical_to("20674752")) expect_that(parsed_multi, is_a("multi_pubmed_record")) expect_that(parsed_multi[[1]], is_a("pubmed_record")) expect_that(length(parsed_multi), equals(4)) # Older (buggier) versions of the pubmed parser included data from every # record in an xml file in each parsed record.
If that error is # re-introduced there will be 25 authors in each record and this will fail expect_that(length(parsed_multi[[1]]$authors), equals(1)) }) test_that("we can print pubmed records", { expect_output(print(parsed_rec), "Pubmed record") expect_output(print(parsed_multi), "List of 4 pubmed records") }) test_that("We warn about unknown pubmed record types", { rec = entrez_fetch(db="pubmed", id=25905152, rettype="xml") expect_warning(parsed_rec <- parse_pubmed_xml(rec)) expect_output(print(parsed_rec), "Pubmed record \\(empty\\)") }) rentrez/tests/testthat/test_fetch.r0000644000176200001440000000305313111437373017264 0ustar liggesuserscontext("fetching records") pop_ids = c("307082412", "307075396", "307075338", "307075274") coi <- entrez_fetch(db = "popset", id = pop_ids[1], rettype = "fasta") xml_rec <- entrez_fetch(db = "popset", id=pop_ids[1], rettype="native", parsed=TRUE) raw_rec <- entrez_fetch(db = "popset", id=pop_ids[1], rettype="native") acc_old = "AF123456.1" acc_new = "AF123456.2" test_that("httr does not warn about inferred encoding", { expect_message( entrez_fetch(db = "popset", id=pop_ids[1], rettype="uilist"), NA) }) test_that("Fetching sequences works", { expect_that(length(strsplit(coi, ">")[[1]]), equals(30)) }) test_that("Entrez_fetch record parsing works", { expect_that(raw_rec, is_a("character")) expect_that(xml_rec, is_a("XMLInternalDocument")) expect_error( entrez_fetch(db="popset", id="307082412", rettype="fasta", parsed=TRUE), "At present, entrez_fetch can only parse XML records, got fasta" ) }) test_that("Entrez fetch can download versioned sequences", { #The two versions of this sequence have different annotations. We can check #that we are getting the correct version of the record by checking the name #of each sequence reflects the change in annotation. old_rec = entrez_fetch(db="nuccore", id="AF123456.1", rettype="fasta") new_rec = entrez_fetch(db="nuccore", id="AF123456.2", rettype="fasta") expect_match(old_rec, "testis-specific mRNA") expect_match(new_rec, "doublesex and mab-3 related transcription factor") }) rentrez/tests/testthat/test_link.r0000644000176200001440000000441013111437373017126 0ustar liggesuserscontext("elink") elinks_mixed <- entrez_link(dbfrom = "pubmed", id = c(19880848, 22883857), db = "all") elinks_by_id <- entrez_link(dbfrom = "pubmed", id = c(19880848, 22883857), db = "all", by_id=TRUE) # #We should maybe download these xmls and test the internal functions # as these really take some downloading,... especially the lib. links?
message("(this may take some time, have to download many records)") commands <- c("neighbor_history", "neighbor_score", "acheck", "ncheck", "lcheck", "llinks", "llinkslib", "prlinks") all_the_commands <- lapply(commands, function(cmd_arg) entrez_link(db="pubmed", dbfrom="pubmed", id=19880848, cmd=cmd_arg) ) test_that("The record-linking functions work",{ expect_that(elinks_mixed, is_a("elink")) expect_that(names(elinks_mixed$links), is_a("character")) expect_true(length(elinks_mixed$links$pubmed_mesh_major) > 0) }) test_that("by_id mode works for elinks", { expect_that(elinks_by_id, is_a("elink_list")) expect_that(length(elinks_by_id), equals(2)) expect_that(elinks_by_id[[1]], is_a("elink")) }) test_that("elink printing behaves", { expect_output(print(elinks_by_id), "List of 2 elink objects, each containing") for(ret in all_the_commands){ expect_output(print(ret), "elink object with contents:\\s+\\$[A-Za-z]+") } }) test_that("We detect missing ids from elink results",{ expect_warning( entrez_link(dbfrom="pubmed", db="all", id=c(20203609,2020360999999,20203610), by_id=TRUE) ) }) test_that("Elink sub-elements can be accessed and printed", { expect_output(print(all_the_commands[[3]][[1]]), "elink result with information from \\d+ databases") expect_output(print(all_the_commands[[8]]$linkouts[[1]]), "Linkout from [ A-Za-z]+\\s+\\$Url") }) test_that("URLs can be extracted from elink objs", { for(idx in 6:8){ urls <- linkout_urls(all_the_commands[[idx]]) expect_that(urls, is_a("list")) expect_that(urls[[1]], is_a("character")) } }) test_that("Elink errors on mis-spelled/unknown cmds",{ expect_error(rcheck <- entrez_link(dbfrom = "pubmed", id = 19880848, db = "all", cmd='rcheck')) }) rentrez/tests/testthat/test_info.r0000644000176200001440000000274513017107141017125 0ustar liggesuserscontext("einfo functions") einfo_rec <- entrez_info() pm_rec <- entrez_info(db="pubmed") test_that(" can get xml recs from einfo", { expect_that(einfo_rec, is_a("XMLInternalDocument")) expect_that(pm_rec, is_a("XMLInternalDocument")) }) dbs <- entrez_dbs() cdd <- entrez_db_summary("cdd") test_that(" We can get summary information on DBs", { expect_that(dbs, is_a("character")) expect_true("pubmed" %in% dbs) expect_that(cdd, is_a("character")) expect_named(cdd) }) search_fields <- entrez_db_searchable("pmc") sf_df <- as.data.frame(search_fields) test_that("We can retrieve search fields", { expect_that(search_fields, is_a("eInfoSearch")) expect_named(search_fields$GRNT) expect_that(sf_df, is_a("data.frame")) }) omim_links <- entrez_db_links("omim") omim_df <- as.data.frame(omim_links) test_that("We can retrieve linked dbs", { expect_that(omim_links, is_a("eInfoLink")) expect_named(omim_links[[1]]) expect_that(omim_df, is_a("data.frame")) expect_equal(nrow(omim_df), length(omim_links)) }) test_that("We can print elink objects", { expect_output(print(omim_links), "Databases with linked records for database 'omim'") expect_output(print(search_fields), "Searchable fields for database 'pmc'") }) test_that("We can print elements from einfo object", { expect_output(print(omim_links$gene), "Name: omim_gene\n") expect_output(print(search_fields$GRNT), "Name: GRNT\n") expect_output(print(cdd), "DbName: cdd") }) rentrez/tests/test-all.R0000644000176200001440000000057413017107141014756 0ustar liggesuserslibrary("testthat") #All of the tests rely on the API existing and behaving as documented. However, #the API occasionally falls over or stops working which leads to errors on CRAN.
#Because we use travis CI we will hear about any test failures as soon as they #happen. So, let's skip all tests on CRAN: if(identical(Sys.getenv("NOT_CRAN"), "true")){ test_check("rentrez") } rentrez/NAMESPACE0000644000176200001440000000222113237161662013166 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method(as.data.frame,eInfoList) S3method(extract_from_esummary,esummary) S3method(extract_from_esummary,esummary_list) S3method(print,eInfoEntry) S3method(print,eInfoLink) S3method(print,eInfoSearch) S3method(print,elink) S3method(print,elink_classic) S3method(print,elink_list) S3method(print,esearch) S3method(print,esummary) S3method(print,esummary_list) S3method(print,linkout) S3method(print,multi_pubmed_record) S3method(print,pubmed_record) S3method(print,web_history) export(entrez_citmatch) export(entrez_db_links) export(entrez_db_searchable) export(entrez_db_summary) export(entrez_dbs) export(entrez_fetch) export(entrez_global_query) export(entrez_info) export(entrez_link) export(entrez_post) export(entrez_search) export(entrez_summary) export(extract_from_esummary) export(linkout_urls) export(parse_pubmed_xml) export(set_entrez_key) importFrom(XML,xmlChildren) importFrom(XML,xmlGetAttr) importFrom(XML,xmlName) importFrom(XML,xmlSApply) importFrom(XML,xmlToList) importFrom(XML,xmlTreeParse) importFrom(XML,xmlValue) importFrom(XML,xpathApply) importFrom(XML,xpathSApply) importFrom(jsonlite,fromJSON) rentrez/NEWS0000644000176200001440000001170113240145272012442 0ustar liggesusersVersion 1.2.0 ------------------ rentrez updated to reflect NCBI's new API policy, allowing more requests from users with registered keys. (Issues #115 -- #117). CITATION updated to reflect publication in The R Journal. Minor changes * clarification of search syntax in docs (issue #120) * ORCID ids added for all authors (issue #118) Version 1.1.0 ------------------ As of this release, rentrez will use httr::POST when sending > 200 ids to the NCBI. This should make working with large ID sets easier (thanks to the NCBI for supporting the POST methods, Reed Cartwright and Chris Stubben for pushing me on issue #89). Other minor changes: * Pass on error messages from NCBI when too many records are requested from `entrez_summary` (Issue #106) * Useful error message when trying to send an empty ID set to NCBI (Issue #107) Version 1.0.4 ------------------ Update to documentation and tests to accommodate versioned accessions now available from NCBI (see ?entrez_fetch and the vignette) Version 1.0.3 ------------------ Update to only use https * NCBI is going all https, rentrez will only use https from now on.
* Added links to repo/bug reporting to DESCRIPTION * Documented changes to sequence database XML records * Allow automatic parsing of XML flavours Version 1.0.2 ------------------- Bug fix release * Tests now work with testthat 1.0.0 * All calls to ncbi specify encoding is UTF-8 (saving error messages) * HTTP Error codes associated with large requests now give the user a hint to check out the documentation for web-history features Version 1.0.1 ------------------- Bug fix release * Properly format "by_id" mode URLs (bug exposed by httr 1.0.1) * Handle case in which some IDs passed to "by_id" mode are invalid (thanks Zachary Foster for report) * Documentation updated to reflect OMIM->SNP links no longer possible * Use Rmarkdown (not knitr) as vignette builder * Return NCBI error messages as text when they exist Version 1.0.0 -------------------- * new function extract_from_esummary() for extracting like-named elements from a list of esummary records (e.g. get all "Title" fields from a list of PubMed esummaries) * Support for `cmd` option in entrez_link (breaks backward compatibility) * Allows discovery of external links and use of web_history * New helper function linkout_urls to get URLs from external links * Support for 'by_id' mode for entrez_link. Pass a vector of IDs to entrez_link, (optionally) get a list of elink objects back (one per ID) * New web_history object makes using NCBI Web History features easier * All of these changes documented in new vignette * Han Guangchun added as contributor for his pull requests * New tests, minor bug fixes and extended documentation Version 0.4.1 --------------------- * Bug fix: The example for entrez_summary contained a typo which made it fail (being wrapped in donttest this hadn't previously shown up). Version 0.4 ------------------------ * entrez_summary now fetches 'version 2.0' esummary records from NCBI * This change may break some scripts. In particular, the names of some elements in esummary records have changed. Broken scripts should produce a helpful error message, and using entrez_summary(..., version="1.0") should fix it. More details are given in the help to entrez_summary. * When version 2.0 records are requested entrez_summary fetches the json record. * New helper functions for einfo Eutil * entrez_dbs() lists available databases. * entrez_db_summary() gets summary information about a given database. * entrez_db_links() lists databases against which a given db's records might be cross referenced. * entrez_db_searchable() lists search terms available for a given database. * Nicer print functions for search and summary objects * New dependency on jsonlite for handling json records. * Bunch of bugs squashed and typos cleaned up Version 0.3.1 ------------------------ * Squashed a bug in the vignette which wrote to users' $HOME Version 0.3 ------------------------ * using httr to handle HTTP GETs and some url building * esummary parsing for the clinvar database * Scott Chamberlain added as contributor for above * Pubmed parser handles multi-record files * html vignette included Version 0.2.4 ------------------------- * minor release to fix bug in esummary parsing Version 0.2.3 --------------------------------- * Edited license/description to meet CRAN requirements * Added sentence to description to summarise the package Version 0.2.2 -------------------------------- * Parsing of esummary xmls is now much nicer.
* S3 items to represent most results * Tests to cover all functions Version 0.1.1 --------------------------------- * First release on CRAN + now part of ROpenSci * Functions cover the whole EUtils API rentrez/R/0000755000176200001440000000000013240132625012142 5ustar liggesusersrentrez/R/entrez_global_query.r0000755000176200001440000000224413237161662016413 0ustar liggesusers#' Find the number of records that match a given term across all NCBI Entrez databases #' #' #' #'@export #'@param term the search term to use #'@param config vector configuration options passed to httr::GET #'@param ... additional arguments to add to the query #'@seealso \code{\link[httr]{config}} for available configs #'@return a named vector with counts for each database #' #' @examples #' #' NCBI_data_on_best_butterflies_ever <- entrez_global_query(term="Heliconius") entrez_global_query <- function(term, config=NULL, ...){ response <- make_entrez_query("egquery", term=gsub(" ", "+", term), config=config, ...) record <- xmlTreeParse(response, useInternalNodes=TRUE) db_names <- xpathSApply(record, "//ResultItem/DbName", xmlValue) get_Ids <- function(dbname){ path <- paste("//ResultItem/DbName[text()='", dbname, "']/../Count", sep="") res <- as.numeric(xpathSApply(record, path, xmlValue)) } #NCBI limits requests to three per second res <- structure(sapply(db_names, get_Ids), names=db_names) return(res) } rentrez/R/base.r0000755000176200001440000001210213237161662013247 0ustar liggesusers#What's going on under the hood. As far as possible we are following the best #practices for API packages suggested by hadley/httr: # # http://cran.r-project.org/web/packages/httr/vignettes/api-packages.html # #and also conforming to the NCBI's requirements about rate limiting and #adding identifiers to each request: # # http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requirements # #As per NCBI's documentation -- we set tool developer's email and tool name: entrez_email <- function() 'david.winter@gmail.com' entrez_tool <- function() 'rentrez' #Create a URL for the EUtils API. # # This function is used by all the API-querying functions in rentrez to build # the appropriate url. Required arguments for each endpoint are handled by # specific functions. All of these functions can use the id_or_webenv() function # (below) to ensure that at least one of these arguments is provided and the # sleep_time() function to set the appropriate time to wait between requests. # # if debug_mode is set to TRUE the function returns a list with the URL and # arguments that would have been passed to GET or POST (useful for debugging # and used in the test suite).
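# # For example (a sketch with placeholder values, mirroring the test suite): # make_entrez_query("efetch", db="nuccore", id=c("443610374", "443610372"), # config=list(), rettype="fasta") builds and sends a request, while adding # debug_mode=TRUE to the same call instead returns list(uri=..., args=...) # without contacting the NCBI.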
make_entrez_query <- function(util, config, interface=".fcgi?", by_id=FALSE, debug_mode=FALSE, ...){ uri <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/", util, interface) args <- list(..., email=entrez_email(), tool=entrez_tool()) if(!("api_key" %in% names(args))){ #no api key set, try to use the system var if(is_entrez_key_set()){ args$api_key <- Sys.getenv('ENTREZ_KEY') } } if("id" %in% names(args)){ if(by_id){ ids_string <- paste0("id=", args$id, collapse="&") args$id <- NULL uri <- paste0(uri, ids_string) } else { args$id <- paste(args$id, collapse=",") } } if(debug_mode){ return( list(uri = uri, args=args ) ) } if(length(args$id) > 200){ response <- httr::POST(uri, body=args, config= config) } else { response <- httr::GET(uri, query=args, config= config) } entrez_check(response) Sys.sleep(sleep_time(args)) httr::content(response, as="text", encoding="UTF-8") } #set the sleep time, depending on presence of api_key in the arguments. Used by # make_entrez_query sleep_time <- function(argument_list){ if("api_key" %in% names(argument_list)){ return(0.1) } 1/3 } ## # Check that we have either the ID or the web-history argument # specified for those functions that need one. ## id_or_webenv <- function(){ args <- sys.frame(sys.parent()) msg <- "Must specify either (not both) 'id' or 'web_history' arguments" if(!is.null(args$id)){ if(!is.null(args$web_history)){ stop(msg, call.=FALSE) } if (length(args$id) == 0){ stop("Vector of IDs to send to NCBI is empty, perhaps entrez_search or entrez_link found no hits?", call.=FALSE) } return(list(id=args$id)) } if(is.null(args$web_history)){ stop(msg, call.=FALSE) } list(WebEnv=args$web_history$WebEnv, query_key=args$web_history$QueryKey) } entrez_check <- function(req){ if (req$status_code < 400) { return(invisible()) } if (req$status_code == 414){ stop("HTTP failure 414, the request is too large. For large requests, try using web history as described in the rentrez tutorial") } if (req$status_code == 502){ stop("HTTP failure: 502, bad gateway. This error code is often returned when trying to download many records in a single request. Try using web history as described in the rentrez tutorial") } message <- httr::content(req, as="text", encoding="UTF-8") stop("HTTP failure: ", req$status_code, "\n", message, call. = FALSE) } #Does a parsed-xml object contain ERRORs as reported by NCBI #(i.e.
<ERROR> entries in a valid XML): check_xml_errors <- function(x){ errs <- x["//ERROR"] if( length(errs) > 0){ for(e in errs){ warning(xmlValue(e)) } } invisible() } parse_response <- function(x, type=NULL){ res <- switch(type, "json" = fromJSON(x), "xml" = xmlTreeParse(x, useInternalNodes=TRUE), "native" = xmlTreeParse(x, useInternalNodes=TRUE), "gbc" = xmlTreeParse(x, useInternalNodes=TRUE), "ipg" = xmlTreeParse(x, useInternalNodes=TRUE), "text" = x, #citmatch uses plain old text x #fall-through, if in doubt, return un-parsed response ) return(res) } #constructor for web history objects web_history <- function(WebEnv, QueryKey){ res <- list(WebEnv=WebEnv, QueryKey=QueryKey) class(res) <- list("web_history", "list") res } #'@export print.web_history <- function(x, ...){ cat("Web history object (QueryKey = ", x$QueryKey, ", WebEnv = ", substr(x$WebEnv, 1, 12), "...", ")\n",sep="") } add_class <- function(x, new_class){ class(x) <- c(new_class, class(x)) x } .last <- function(s){ len <- nchar(s) substr(s, len, len) } rentrez/R/entrez_citmatch.r0000644000176200001440000000342313017107141015507 0ustar liggesusers#' Fetch pubmed ids matching specially formatted citation strings #' #'@param bdata character, containing citation data. #' Each citation must be represented in a pipe-delimited format #' journal_title|year|volume|first_page|author_name|your_key| #' The final field "your_key" is arbitrary, and can be used as you see #' fit. Fields can be left empty, but be sure to keep 6 pipes. #'@param db character, the database to search. Defaults to pubmed, #' the only database currently available #'@param retmode character, file format to retrieve. Defaults to xml, as #' per the API documentation, though note the API only returns plain text #'@param config vector configuration options passed to httr::GET #'@return A character vector containing PMIDs #'@seealso \code{\link[httr]{config}} for available configs #'@export #'@examples #'\donttest{ #' ex_cites <- c("proc natl acad sci u s a|1991|88|3248|mann bj|test1|", #' "science|1987|235|182|palmenberg ac|test2|") #' entrez_citmatch(ex_cites) #'} entrez_citmatch <- function(bdata, db="pubmed", retmode="xml", config=NULL){ if(length(bdata) > 1){ bdata <- paste0(bdata, collapse="\r") } bdata <- ifelse(.last(bdata)=="|", bdata, paste0(bdata, "|")) request <- make_entrez_query("ecitmatch", bdata=bdata, db=db, retmode=retmode, interface=".cgi?", config=config) results <- strsplit(strsplit(request, "\n")[[1]], "\\|") sapply(results, extract_pmid) } extract_pmid <- function(line){ tryCatch("[["(line,7), error=function(e){ warning(paste("No pmid found for line", line)) NA } ) } rentrez/R/parse_pubmed_xml.r0000755000176200001440000000642113017107141015656 0ustar liggesusers#' Summarize an XML record from pubmed. #' #' Note: this function assumes all records are of the type "PubmedArticle" #' and will return an empty record for any other type (including books).
#' #'@export #'@param record Either an XMLInternalDocument or character, the record to be #'parsed (expected to come from \code{\link{entrez_fetch}}) #'@return Either a single pubmed_record object, or a list of several #'@importFrom XML xmlName #'@examples #' #' hox_paper <- entrez_search(db="pubmed", term="10.1038/nature08789[doi]") #' hox_rel <- entrez_link(db="pubmed", dbfrom="pubmed", id=hox_paper$ids) #' recs <- entrez_fetch(db="pubmed", #' id=hox_rel$links$pubmed_pubmed[1:3], #' rettype="xml") #' parse_pubmed_xml(recs) #' parse_pubmed_xml<- function(record){ if(typeof(record) == "character"){ record <- xmlTreeParse(record, useInternalNodes=TRUE) } res <- xpathApply(record, "/PubmedArticleSet/*", parse_one_pubmed) if(length(res)==1){ return(res[[1]]) } class(res) <- c("multi_pubmed_record", "list") return(res) } #The work-horse function - get information from a single xml rec parse_one_pubmed <- function(paper){ atype <- xmlName(paper) if( atype != "PubmedArticle" ){ pmid = xpathSApply(paper, "//PMID", xmlValue) msg = paste0("Pubmed record ", pmid, " is of type '", atype, "' which rentrez doesn't know how to parse.", " Returning empty record") warning(msg) return(structure(list(), class="pubmed_record", empty=TRUE)) } get_value <- function(path){ return(xpathSApply(paper, path, xmlValue)) } res <- list() res$title <- get_value(".//ArticleTitle") res$authors <- paste(get_value(".//Author/LastName"), get_value(".//Author/ForeName"), sep=", ") res$year <- get_value(".//PubDate/Year") res$journal <- get_value(".//Journal/Title") res$volume <- get_value(".//JournalIssue/Volume") res$issue <- get_value(".//JournalIssue/Issue") res$pages <- get_value(".//MedlinePgn") res$key_words <- get_value(".//DescriptorName") res$doi <- get_value(".//ArticleId[@IdType='doi']") res$pmid <- get_value(".//ArticleId[@IdType='pubmed']") res$abstract <- get_value(".//AbstractText") structure(res, class="pubmed_record", empty=FALSE) } #' @export print.pubmed_record <- function(x, first_line=TRUE, ...){ if( attr(x, "empty")){ cat('Pubmed record (empty)\n') return() } if(length(x$authors) == 1){ display.author <- x$authors[1] } else if(length(x$authors) == 2){ display.author <- with(x, paste(authors[1], authors[2], sep=". & ")) } else display.author <- paste(x$authors[1], "et al") display <- with(x, sprintf(" %s. (%s). %s. %s:%s", display.author, year, journal, volume, pages)) if(first_line){ cat("Pubmed record", "\n") } cat(display, "\n") } #' @export print.multi_pubmed_record <- function(x, ...){ nrecs <- length(x) cat("List of", nrecs, "pubmed records\n") if( nrecs > 3){ sapply(x[1:3], print, first_line=FALSE) cat(".\n.\n.\n") } else sapply(x, print, first_line=FALSE) } rentrez/R/api_keys.r0000644000176200001440000000153613237161662014137 0ustar liggesusers# Handle NCBI API keys #' Set the ENTREZ_KEY variable to be used by all rentrez functions #' #' The NCBI allows users to make more requests (up to 10 per second) if they #' register for and use an API key. This function allows users to set this key #' for all calls to rentrez functions during a particular R session. See the #' vignette section "Using API keys" for a detailed description. #'@export #'@param key character. Value to set ENTREZ_KEY to (i.e. your API key). #'@return A logical of length one, TRUE if the value was set, FALSE if not. The #' value is returned inside invisible(), i.e. it is not printed to screen #' when the function is called. set_entrez_key <- function(key){ Sys.setenv(ENTREZ_KEY=key) }
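# Example (a sketch, not run): after set_entrez_key("ABCD123"), # Sys.getenv("ENTREZ_KEY") returns "ABCD123" and every subsequent call to a # rentrez function picks the key up automatically (see make_entrez_query in # base.r).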
#internal function, used to test the existence of a key. is_entrez_key_set <- function(){ !identical(Sys.getenv('ENTREZ_KEY'), "") } rentrez/R/entrez_search.r0000755000176200001440000001367313240132621015172 0ustar liggesusers#' Search the NCBI databases using EUtils #' #' Search a given NCBI database with a particular query. #' #' The NCBI uses a search term syntax where search terms can be associated with #' a specific search field with square brackets. So, for instance ``Homo[ORGN]'' #' denotes a search for Homo in the ``Organism'' field. The names and #' definitions of these fields can be identified using #' \code{\link{entrez_db_searchable}}. #' #' Searches can make use of several fields by combining them via the boolean #' operators AND, OR and NOT. So, using the search term ``((Homo[ORGN] AND APP[GENE]) NOT #' Review[PTYP])'' in PubMed would identify articles matching the gene APP in #' humans, and exclude review articles. More examples of the use of these search #' terms, and the more specific MeSH terms for precise searching, #' are given in the package vignette. \code{rentrez} handles special characters #' and URL encoding (e.g. replacing spaces with plus signs) on the client side, #' so there is no need to include these in the search term #' #' The \code{rentrez} tutorial provides some tips on how to make the most of #' searches to the NCBI. In particular, the sections on uses of the "Filter" #' field and MeSH terms may help in formulating precise searches. #' #'@export #'@param db character, name of the database to search for. #'@param term character, the search term. The syntax used in making these #'searches is described in the Details of this help message, the package #'vignette and reference given below. #'@param use_history logical. If TRUE return a web_history object for use in #' later calls to the NCBI #'@param retmode character, one of xml (default) or json. This will make no #' difference in most cases. #'@param \dots character, additional terms to add to the request, see NCBI #'documentation linked to in references for a complete list #'@param config vector configuration options passed to httr::GET #'@seealso \code{\link[httr]{config}} for available httr configurations #'@seealso \code{\link{entrez_db_searchable}} to get a set of search fields that #' can be used in \code{term} for any database #'@return ids integer Unique IDs returned by the search #'@return count integer Total number of hits for the search #'@return retmax integer Maximum number of hits returned by the search #'@return web_history A web_history object for use in subsequent calls to NCBI #'@return QueryTranslation character, search term as the NCBI interpreted it #'@return file either an XMLInternalDocument xml file resulting from search, parsed with #'\code{\link[XML]{xmlTreeParse}} or, if \code{retmode} was set to json a list #' resulting from the returned JSON file being parsed with #' \code{\link[jsonlite]{fromJSON}}. #'@references \url{http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESearch_} #'@examples #' \dontrun{ #' query <- "Gastropoda[Organism] AND COI[Gene]" #' web_env_search <- entrez_search(db="nuccore", query, use_history=TRUE) #' cookie <- web_env_search$web_history$WebEnv #' qk <- web_env_search$web_history$QueryKey #' snail_coi <- entrez_fetch(db = "nuccore", WebEnv = cookie, query_key = qk, #' rettype = "fasta", retmax = 10) #'} #'\donttest{ #' #' fly_id <- entrez_search(db="taxonomy", term="Drosophila") #' #Oh, right. There is a genus and a subgenus name Drosophila...
#' #how can we limit this search #' (tax_fields <- entrez_db_searchable("taxonomy")) #' #"RANK" looks promising #' tax_fields$RANK #' entrez_search(db="taxonomy", term="Drosophila & Genus[RANK]") #'} entrez_search <- function(db, term, config=NULL, retmode="xml", use_history=FALSE, ... ){ usehistory <- if(use_history) "y" else "n" response <- make_entrez_query("esearch", db=db, term=term, config=config, retmode=retmode, usehistory=usehistory, ...) parsed <- parse_response(response, retmode) parse_esearch(parsed, history=use_history) } parse_esearch <- function(x, history) UseMethod("parse_esearch") parse_esearch.XMLInternalDocument <- function(x, history){ res <- list( ids = xpathSApply(x, "//IdList/Id", xmlValue), count = as.integer(xmlValue(x[["/eSearchResult/Count"]])), retmax = as.integer(xmlValue(x[["/eSearchResult/RetMax"]])), QueryTranslation = xmlValue(x[["/eSearchResult/QueryTranslation"]]), file = x) if(history){ res$web_history = web_history( QueryKey = xmlValue(x[["/eSearchResult/QueryKey"]]), WebEnv = xmlValue(x[["/eSearchResult/WebEnv"]]) ) } class(res) <- c("esearch", "list") return(res) } parse_esearch.list <- function(x, history){ #for consistency between xml/json records we are going to change the #element names from lower -> CamelCase res <- x$esearchresult[ c("idlist", "count", "retmax", "querytranslation") ] names(res)[c(1,4)] <- c("ids", "QueryTranslation") if(history){ res$web_history = web_history(QueryKey = x$esearchresult[["querykey"]], WebEnv = x$esearchresult[["webenv"]]) } res$count <- as.integer(res$count) res$retmax <- as.integer(res$retmax) res$file <- x class(res) <- c("esearch", "list") return(res) } #'@export print.esearch <- function(x, ...){ display_term <- if(nchar(x$QueryTranslation) > 50){ paste(substr(x$QueryTranslation, 1, 50), "...") } else x$QueryTranslation cookie_word <- if("web_history" %in% names(x)) "a" else "no" msg <- paste("Entrez search result with", x$count, "hits (object contains", length(x$ids), "IDs and", cookie_word, "web_history object)\n Search term (as translated): " , display_term, "\n") cat(msg) } rentrez/R/entrez_info.r0000644000176200001440000001415513017107141014652 0ustar liggesusers#' Get information about EUtils databases #' #' Gather information about EUtils generally, or a given Eutils database. #'Note: The most common use-cases for the einfo util are finding the list of #' search fields available for a given database or the other NCBI databases to #' which records in a given database might be linked. Both these use cases #' are implemented in higher-level functions that return just this information #' (\code{entrez_db_searchable} and \code{entrez_db_links} respectively). #' Consequently most users will not have a reason to use this function (though #' it is exported by \code{rentrez} for the sake of completeness).
#'@param db character database about which to retrieve information (optional) #'@param config config vector passed on to \code{httr::GET} #'@return XMLInternalDocument with information describing either all the #'databases available in Eutils (if db is not set) or one particular database #'(set by 'db') #'@seealso \code{\link[httr]{config}} for available httr configurations #'@family einfo #'@importFrom XML xmlChildren xmlName xpathSApply #'@examples #'\dontrun{ #'all_the_data <- entrez_info() #'XML::xpathSApply(all_the_data, "//DbName", xmlValue) #'entrez_dbs() #'} #'@export entrez_info <- function(db=NULL, config=NULL){ req <- make_entrez_query("einfo", db=db, config=config) res <- parse_response(req, "xml") check_xml_errors(res) res } #' List databases available from the NCBI #' #' Retrieves the names of databases available through the EUtils API #'@param config config vector passed to \code{httr::GET} #'@family einfo #'@return character vector listing available dbs #'@export #'@examples #'\donttest{ #' entrez_dbs() #'} entrez_dbs <- function(config=NULL){ xpathSApply(entrez_info(config=config), "//DbName", xmlValue) } #' Retrieve summary information about an NCBI database #' #'@param config config vector passed to \code{httr::GET} #'@param db character, name of the database to summarize #'@return Character vector with the following data #'@return DbName Name of database #'@return Description Brief description of the database #'@return Count Number of records contained in the database #'@return MenuName Name in web-interface to EUtils #'@return DbBuild Unique ID for current build of database #'@return LastUpdate Date of most recent update to database #'@family einfo #'@examples #'entrez_db_summary("pubmed") #'@export entrez_db_summary <- function(db, config=NULL){ rec <- entrez_info(db, config) unparsed <- xpathApply( rec, "//DbInfo/*[not(self::LinkList or self::FieldList)]") res <- sapply(unparsed, xmlValue) names(res) <- sapply(unparsed, xmlName) class(res) <- c("eInfoEntry", class(res)) res } #' List available links for records from a given NCBI database #' #' For a given database, fetch a list of other databases that contain #' cross-referenced records. The names of these records can be used as the #' \code{db} argument in \code{\link{entrez_link}} #' #'@param config config vector passed to \code{httr::GET} #'@param db character, name of database to search #'@return An eInfoLink object (sub-classed from list) summarizing linked-databases. #' Can be coerced to a data-frame with \code{as.data.frame}. Printing the object #' shows the name of each element (which is the correct name for use with #' \code{entrez_link}), and these names can be used to get (a little) more #' information about each linked database #' (see example below).
#'@family einfo #'@seealso \code{\link{entrez_link}} #'@examples #' \donttest{ #'taxid <- entrez_search(db="taxonomy", term="Osmeriformes")$ids #'tax_links <- entrez_db_links("taxonomy") #'tax_links #'entrez_link(dbfrom="taxonomy", db="pmc", id=taxid) #' #'sra_links <- entrez_db_links("sra") #'as.data.frame(sra_links) #'} #'@export entrez_db_links <- function(db, config=NULL){ rec <- entrez_info(db, config) unparsed <- xpathApply(rec, "//Link", xmlChildren) res <- lapply(unparsed, lapply, xmlValue) res <- lapply(res, add_class, new_class='eInfoEntry') names(res) <- sapply(res, "[[", "DbTo") class(res) <- c("eInfoLink", "eInfoList", "list") attr(res, 'db') <- xmlValue(rec["/eInfoResult/DbInfo/DbName"][[1]]) res } #' List available search fields for a given database #' #'Fetch a list of search fields that can be used with a given database. Fields #' can be used as part of the \code{term} argument to \code{\link{entrez_search}} #'@param config config vector passed to \code{httr::GET} #'@param db character, name of database to get search fields from #'@return An eInfoSearch object (subclassed from list) summarizing the search #' fields available for the given database. Can be coerced to a data-frame with #' \code{as.data.frame}. Printing the object shows the name and a brief #' description of each available search field. #'@seealso \code{\link{entrez_search}} #'@family einfo #'@examples #'\donttest{ #' pmc_fields <- entrez_db_searchable("pmc") #' pmc_fields[["AFFL"]] #' entrez_search(db="pmc", term="Otago[AFFL]", retmax=0) #' entrez_search(db="pmc", term="Auckland[AFFL]", retmax=0) #' #' sra_fields <- entrez_db_searchable("sra") #' as.data.frame(sra_fields) #'} #'@export entrez_db_searchable <- function(db, config=NULL){ rec <- entrez_info(db, config) unparsed <- xpathApply(rec, "/eInfoResult/DbInfo/FieldList/Field", xmlChildren) res <- lapply(unparsed, lapply, xmlValue) res <- lapply(res, add_class, new_class="eInfoEntry") names(res) <- sapply(res, "[[", "Name") class(res) <- c("eInfoSearch", "eInfoList", "list") attr(res, 'db') <- xmlValue(rec["/eInfoResult/DbInfo/DbName"][[1]]) res } #'@export print.eInfoLink<- function(x, ...){ cat("Databases with linked records for database '", attr(x, "db"), "'\n", sep="") print(names(x), quote=FALSE) } #'@export as.data.frame.eInfoList <- function(x, ...){ data.frame(do.call("rbind", x), row.names=NULL) } #'@export print.eInfoSearch <- function(x, ...){ cat("Searchable fields for database '", attr(x, "db"), "'\n", sep="") for (term in x){ cat(" ", term$Name, "\t", term$Description, "\n") } } #'@export print.eInfoEntry <- function(x, ...){ cat(paste0(" ", names(x), ": ", unlist(x), collapse="\n"), "\n") } rentrez/R/help.r0000755000176200001440000000162113017107141013255 0ustar liggesusers#' rentrez #' #' rentrez provides functions to search for, discover and download data from #' the NCBI's databases using their EUtils API. #' #' Users are expected to know a little bit about the EUtils API, which is well #' documented: \url{http://www.ncbi.nlm.nih.gov/books/NBK25500/} #' #' The NCBI will ban IPs that don't use EUtils within their \href{http://www.ncbi.nlm.nih.gov/corehtml/query/static/eutils_help.html}{user guidelines}.
In particular #' \enumerate{ #' \item Don't send more than three requests per second (rentrez enforces this limit) #' \item If you plan on sending a sequence of more than ~100 requests, do so outside of peak times for the US #' \item For large requests use the web history method (see examples for \code{\link{entrez_search}} or use \code{\link{entrez_post}} to upload IDs) #'} #' @docType package #' @name rentrez #' @aliases rentrez rentrez-package #' NULL rentrez/R/entrez_link.r0000755000176200001440000002225513240132354014661 0ustar liggesusers#' Get links to datasets related to records from an NCBI database #' #' Discover records related to a set of unique identifiers from #' an NCBI database. The object returned by this function depends on the value #' set for the \code{cmd} argument. Printing the returned object lists the names, #' and provides a brief description, of the elements included in the object. #' #'@export #'@param db character Name of the database to search for links (or use "all" to #' search all databases available for \code{db}). \code{entrez_db_links} allows you #' to discover databases that might have linked information (see examples). #'@param id vector with unique ID(s) for records in database \code{dbfrom}. #'@param web_history a web_history object #'@param dbfrom character Name of database from which the Id(s) originate #'@param by_id logical If FALSE (default) return a single #' \code{elink} object containing links for all of the provided \code{id}s. #' Alternatively, if TRUE return a list of \code{elink} objects, one for each #' ID in \code{id}. #'@param cmd link function to use. Allowed values include #' \itemize{ #' \item neighbor (default). Returns a set of IDs in \code{db} linked to the #' input IDs in \code{dbfrom}. #' \item neighbor_score. As 'neighbor', but additionally returns similarity scores. #' \item neighbor_history. As 'neighbor', but returns web history objects. #' \item acheck. Returns a list of linked databases available from NCBI for a set of IDs. #' \item ncheck. Checks for the existence of links within a single database. #' \item lcheck. Checks for external (i.e. outside NCBI) links. #' \item llinks. Returns a list of external links for each ID, excluding links #' provided by libraries. #' \item llinkslib. As 'llinks' but additionally includes links provided by #' libraries. #' \item prlinks. As 'llinks' but returns only the primary external link for #' each ID. #'} #'@param \dots character Additional terms to add to the request, see NCBI #'documentation linked to in references for a complete list #'@param config vector configuration options passed to httr::GET #'@seealso \code{\link[httr]{config}} for available configs #'@seealso \code{entrez_db_links} #'@return An elink object containing the data defined by the \code{cmd} argument #'(if by_id=FALSE) or a list of such objects (if by_id=TRUE).
#'@return file XMLInternalDocument xml file resulting from search, parsed with #'\code{\link{xmlTreeParse}} #'@references \url{http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ELink_} #'@importFrom XML xmlToList #' @examples #' \donttest{ #' pubmed_search <- entrez_search(db = "pubmed", term ="10.1016/j.ympev.2010.07.013[doi]") #' linked_dbs <- entrez_db_links("pubmed") #' linked_dbs #' nucleotide_data <- entrez_link(dbfrom = "pubmed", id = pubmed_search$ids, db ="nuccore") #' #Sources for the full text of the paper #' res <- entrez_link(dbfrom="pubmed", db="", cmd="llinks", id=pubmed_search$ids) #' linkout_urls(res) #'} #' entrez_link <- function(dbfrom, web_history=NULL, id=NULL, db=NULL, cmd='neighbor', by_id=FALSE, config=NULL, ...){ identifiers <- id_or_webenv() args <- c(list("elink", db=db, dbfrom=dbfrom, cmd=cmd, config=config, by_id=by_id, ...), identifiers) if(by_id){ if(is.null(id)) stop("Can't use by_id mode without ids!") } response <- do.call(make_entrez_query,args) record <- parse_response(response, 'xml') res <- parse_elink(record, cmd=cmd, by_id=by_id) if(!is.null(id) & by_id){ if(length(res) != length(id)){ msg <- paste( id[!(id %in% res)], ", ") warning("Some IDs appear to be invalid. Result containing no information for the following IDs: ", msg) } } res } #' Extract URLs from an elink object #' @param elink elink object (returned by entrez_link) containing Urls #' @return list of character vectors, one per ID, each containing the URLs for that #' ID. #' @seealso entrez_link #' @export linkout_urls <- function(elink){ if (!("linkouts" %in% names(elink))){ stop("No linkouts in the elink object. Use entrez_link commands 'prlinks', 'llinks' or 'llinkslib' to fetch urls") } lapply(elink$linkouts, function(lo) if(length(lo) == 0) NA else sapply(lo, "[[", "Url")) } # # Parsing Elink is.... fun. The XML files returned by the different 'cmd' # args are very different, so we can't hope for a one-size-fits all solution. # Instead, we can break off a few similar cases and write parsing functions, # which we dispatch via a big switch statement. # # Each parsing function should return a list with elements corresponding to the # data in XML, and set the attribute "content" to a brief description of what # each element in the record contains, to be used by the print fxn. # # In addition, the "by_id" mode # means we sometimes return a list of elink objects, having applied the # relevant function to each "LinkSet" in the XML.
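# For example (illustrative, not run): entrez_link(dbfrom="pubmed", # id=19880848, cmd="llinks") is handled by parse_linkouts() below, while the # default cmd="neighbor" goes to parse_neighbors().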
# parse_elink <- function(x, cmd, by_id, id){ check_xml_errors(x) f <- make_elink_fxn(cmd) res <- xpathApply(x, "//LinkSet",f) if(length(res) > 1){ class(res) <- c("elink_list", "list") return(res) } res[[1]] } make_elink_fxn <- function(cmd){ f <- switch(cmd, "neighbor" = parse_neighbors, "neighbor_score" = function(x) parse_neighbors(x, scores=TRUE), "neighbor_history" = parse_history, "acheck" = parse_acheck, "ncheck" = function(x) parse_check(x, "HasNeighbor"), "lcheck" = function(x) parse_check(x, "HasLinkOut"), "llinkslib" = parse_linkouts, "llinks" = parse_linkouts, "prlinks" = parse_linkouts, stop("Don't know how to deal with cmd ", cmd) ) function(x){ res <- f(x) class(res) <- c("elink", "list") res } } parse_neighbors <- function(x, scores=FALSE){ content <- "" if("-1" %in% xpathSApply(x, "//IdList/Id", xmlValue)){ warning("Some IDs not found") } db_names <- xpathSApply(x, "LinkSetDb/LinkName", xmlValue) links <- sapply(db_names, get_linked_elements, record=x, element="Id", simplify=FALSE) class(links) <- c("elink_classic", "list") res <- list(links = links, file=x) if(scores){ nscores <- sapply(db_names, get_linked_elements, record=x, element="Score", simplify=FALSE) class(nscores) <- c("elink_classic", "list") content <- " $scores: weighted neighbouring scores for each hit in links\n" res$scores <- nscores } attr(res, "content") <- paste(" $links: IDs for linked records from NCBI\n", content) res } parse_history <- function(x){ qks <- xpathSApply(x, "LinkSetDbHistory/QueryKey", xmlValue, simplify=FALSE) cookie <- xmlValue(x[["WebEnv"]]) histories <- lapply(qks, web_history, WebEnv=cookie) names(histories) <- xpathSApply(x, "//LinkSetDbHistory/LinkName", xmlValue) res <- list(web_histories=histories, file=x) attr(res, "content") <- paste0(" $web_histories: Objects containing web history information\n") res } parse_acheck <- function(x){ db_info <- xpathApply(x, "//LinkInfo", xmlToList) names(db_info) <- sapply(db_info, "[[","LinkName") class(db_info) <- "elink_classic" res <- list(linked_databases = db_info) attr(res, "content") <- " $linked_databases: a list of summary data from each database with linked records" res } parse_check <- function(x, attr){ path <- paste0("IdCheckList/Id/@", attr) is_it_y <- structure(names= xpathSApply(x, "IdCheckList/Id", xmlValue), xpathSApply(x, path, `==`, "Y")) res <- list(check = is_it_y) attr(res, "content") <- " $check: TRUE/FALSE for whether each ID has links" res } parse_linkouts <- function(x){ per_id <- xpathApply(x, "//IdUrlList/IdUrlSet") list_per_id <- lapply(per_id, function(x) lapply(x["ObjUrl"], xmlToList)) names(list_per_id) <-paste0("ID_", sapply(per_id,function(x) xmlValue(x[["Id"]]))) list_o_lists <- lapply(list_per_id, unname)#otherwise first element of each list has same name!
list_o_lists <- lapply(list_o_lists, lapply, add_class, "linkout") res <- list( linkouts = list_o_lists) attr(res, "content") <- " $linkouts: links to external websites" res } #' @export print.elink_list <- function(x, ...){ payload <- attr(x[[1]], "content") cat("List of", length(x), "elink objects, each containing\n", payload) } #' @export print.elink <- function(x, ...){ payload <- attr(x, "content") cat("elink object with contents:\n", payload, "\n",sep="") } #' @export print.linkout <- function(x,...){ cat("Linkout from", x$Provider$Name, "\n $Url:", substr(x$Url, 1, 26), "...\n") } #' @export print.elink_classic <- function(x, ...){ len <- length(x) cat(paste("elink result with information from", len , "databases:\n")) print(names(x), quote=FALSE) } get_linked_elements <- function(record, dbname, element){ path <- paste0("LinkSetDb/LinkName[text()='", dbname, "']/../Link/", element) return(xpathSApply(record, path, xmlValue)) } rentrez/R/entrez_fetch.r0000755000176200001440000000656513237161662015025 0ustar liggesusers#' Download data from NCBI databases #' #' A set of unique identifiers must be specified with either the \code{id} #' argument (which directly specifies the IDs as a numeric or character vector) #' or a \code{web_history} object as returned by #' \code{\link{entrez_link}}, \code{\link{entrez_search}} or #' \code{\link{entrez_post}}. See #' \href{https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/}{Table 1} #' in the linked reference for the set of #' formats available for each database. In particular, note that sequence #' databases (nuccore, protein and their relatives) use specific format names #' (eg "native", "ipg") for different flavours of xml. #' #' For the most part, this function returns a character vector containing the #' fetched records. For XML records (including 'native', 'ipg', 'gbc' sequence #' records), setting \code{parsed} to \code{TRUE} will return an #' \code{XMLInternalDocument}. #' #'@export #'@param db character, name of the database to use #'@param id vector (numeric or character), unique ID(s) for records in database #'\code{db}. In the case of sequence databases these IDs can take the form of an #' NCBI accession followed by a version number (eg AF123456.1 or AF123456.2). #'@param web_history a web_history object #'@param rettype character, format in which to get data (eg, fasta, xml...) #'@param retmode character, mode in which to receive data, defaults to 'text' #'@param config vector, httr configuration options passed to httr::GET #'@param \dots character, additional terms to add to the request, see NCBI #'documentation linked to in references for a complete list #'@references \url{http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_EFetch_} #'@param parsed boolean should entrez_fetch attempt to parse the resulting #' file. Only works with xml records (including those with rettypes other than #' "xml") at present #'@seealso \code{\link[httr]{config}} for available \code{httr} configs #'@return character string containing the file created #'@return XMLInternalDocument a parsed XML document if parsed=TRUE and #'rettype is a flavour of XML.
# #' @examples #' \dontrun{ #' katipo <- "Latrodectus katipo[Organism]" #' katipo_search <- entrez_search(db="nuccore", term=katipo) #' katipo_seqs <- entrez_fetch(db="nuccore", id=katipo_search$ids, rettype="fasta") #' #xml #' katipo_seqs <- entrez_fetch(db="nuccore", id=katipo_search$ids, rettype="native") #'} entrez_fetch <- function(db, id=NULL, web_history=NULL, rettype, retmode="", parsed=FALSE, config=NULL, ...){ identifiers <- id_or_webenv() if(parsed){ if(!is_xml_record(rettype, retmode)){ msg <- paste("At present, entrez_fetch can only parse XML records, got", rettype) stop(msg) } } args <- c(list("efetch", db=db, rettype=rettype, config=config, ...), identifiers) records <- do.call(make_entrez_query, args) if(parsed){ #At the moment, this is just a long-winded way to call #XML::xmlTreeParse, but we already use this approach to parse #esummaries, and this is more flexible if NCBI starts sharing more #records in JSON. return(parse_response(records, rettype)) } records } is_xml_record <- function(rettype, retmode){ if(rettype %in% c("xml", "native", "gbc","ipg")){ return(TRUE) } retmode == "xml" } rentrez/R/entrez_summary.r0000755000176200001440000002054413111437373015426 0ustar liggesusers#' Get summaries of objects in NCBI datasets from a unique ID # #' #' The NCBI offers two distinct formats for summary documents. #' Version 1.0 is a relatively limited summary of a database record based on a #' shared Document Type Definition. Version 1.0 summaries are only available as #' XML and are not available for some newer databases. #' Version 2.0 summaries generally contain more information about a given #' record, but each database has its own distinct format. 2.0 summaries are #' available for records in all databases and as JSON and XML files. #' As of version 0.4, rentrez fetches version 2.0 summaries by default and #' uses JSON as the exchange format (as JSON objects can be more easily converted #' into native R types). Existing scripts which relied on the structure and #' naming of the "Version 1.0" summary files can be updated by setting the new #' \code{version} argument to "1.0". #' #' By default, entrez_summary returns a single record when only one ID is #' passed and a list of such records when multiple IDs are passed. This can lead #' to unexpected behaviour when the results of a variable number of IDs (perhaps the #' result of \code{entrez_search}) are processed with an apply family function #' or in a for-loop. If you use this function as part of a function or script that #' generates a variably-sized vector of IDs, setting \code{always_return_list} to #' \code{TRUE} will avoid these problems. The function #' \code{extract_from_esummary} is provided for the specific case of extracting #' named elements from a list of esummary objects, and is designed to work on #' single objects as well as lists.
#' In the case of sequence databases these IDs can take the form of an #' NCBI accession followed by a version number (eg AF123456.1 or AF123456.2) #'@param web_history A web_history object #'@param always_return_list logical, return a list of esummary objects even #'when only one ID is provided (see description for a note about this option) #'@param \dots character Additional terms to add to the request, see NCBI #'documentation linked to in references for a complete list #'@param config vector configuration options passed to \code{httr::GET} #'@param version either 1.0 or 2.0, see above for description #'@param retmode either "xml" or "json". By default, xml will be used for #'version 1.0 records, json for version 2.0. #'@references \url{http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESummary_} #'@seealso \code{\link[httr]{config}} for available configs #'@seealso \code{\link{extract_from_esummary}} which can be used to extract #'elements from a list of esummary records #'@return A list of esummary records (if multiple IDs are passed and #'always_return_list is FALSE) or a single record. #'@return file XMLInternalDocument xml file containing the entire record #'returned by the NCBI. #'@importFrom XML xpathApply xmlSApply xmlGetAttr xmlValue #'@importFrom jsonlite fromJSON #' @examples #'\donttest{ #' pop_ids = c("307082412", "307075396", "307075338", "307075274") #' pop_summ <- entrez_summary(db="popset", id=pop_ids) #' extract_from_esummary(pop_summ, "title") #' #' # clinvar example #' res <- entrez_search(db = "clinvar", term = "BRCA1", retmax=10) #' cv <- entrez_summary(db="clinvar", id=res$ids) #' cv #' extract_from_esummary(cv, "title", simplify=FALSE) #' extract_from_esummary(cv, "trait_set")[1:2] #' extract_from_esummary(cv, "gene_sort") #' } entrez_summary <- function(db, id=NULL, web_history=NULL, version=c("2.0", "1.0"), always_return_list = FALSE, retmode=NULL, config=NULL, ...){ identifiers <- id_or_webenv() v <- match.arg(version) if(is.null(retmode)){ retmode <- if(v == "1.0") "xml" else "json" } if(retmode == "json" && v == "1.0"){ stop("Version 1.0 records are only available as xml, not json") } args <- c(list("esummary", db=db, config=config, retmode=retmode, version=v, ...), identifiers) response <- do.call(make_entrez_query, args) whole_record <- parse_response(response, retmode) parse_esummary(whole_record, v, always_return_list) } #' Extract elements from a list of esummary records #'@export #'@param esummaries A list of esummary objects #'@param elements the names of the elements to extract #'@param simplify logical, if possible return a vector #'@return List or vector containing requested elements extract_from_esummary <- function(esummaries, elements, simplify=TRUE){ UseMethod("extract_from_esummary", esummaries) } #'@export extract_from_esummary.esummary <- function(esummaries, elements, simplify=TRUE){ fxn <- if(simplify & length(elements)==1) `[[` else `[` fxn(esummaries, elements) } #'@export extract_from_esummary.esummary_list <- function(esummaries, elements, simplify=TRUE){ fxn <- if (simplify & length(elements) == 1) `[[` else `[` sapply(esummaries, fxn, elements, simplify=simplify) } parse_esummary <- function(x, version, always_return_list) UseMethod("parse_esummary") check_json_errs <- function(rec){ if("error" %in% names(rec)){ msg <- paste0("ID ", rec$uid, " produced error '", rec$error, "'") warning(msg, call.=FALSE) } invisible() } parse_esummary.list <- function(x, version, always_return_list){ #already parsed by jsonlite, just add check for
errors, then re-class #First make sure the file doesn't have an error at the root if(!is.null(x[["error"]])){ warning("Esummary includes error message: ", x[["error"]], call.=FALSE) } res <- x$result[-1] #remove UIDs from result (they are already names of sub-elements) # Make sure there are some records in this file if(length(res) == 0){ stop("No esummary records found in file", call.=FALSE) } #Finally check for errors _within_ each record sapply(res, check_json_errs) #OK: all clear, return the records res <- lapply(res, add_class, new_class="esummary") if(length(res)==1 & !always_return_list){ return(res[[1]]) } class(res) <- c("esummary_list", "list") res } # Parse a summary XML # # Logic goes like this # 1. Define functions parse_esumm_* to handle all data types # 2. For each node detect type, parse accordingly # 3. Wrap it all up in the parse_esummary method below # # #@export parse_esummary.XMLInternalDocument <- function(x, version, always_return_list){ check_xml_errors(x) #Version 2.0 records have no type information (int, list etc) so we # can only return them as characters if(version == "2.0"){ res <- lapply(x["//DocumentSummary"], xmlToList) res <- lapply(res, add_class, "esummary") names(res) <- sapply(res, function(x) x[[".attrs"]]["uid"]) } else{ recs <- x["//DocSum"] if(length(recs)==0){ stop("Esummary document contains no DocSums, try 'version=2.0'?") } per_rec <- function(r){ res <- xpathApply(r, "Item", parse_node) names(res) <- xpathApply(r, "Item", xmlGetAttr, "Name") res <- c(res, file=x) class(res) <- c("esummary", class(res)) return(res) } if(length(recs)==1 & !always_return_list){ return(per_rec(recs[[1]])) } res <- lapply(recs, per_rec) names(res) <- xpathSApply(x, "//DocSum/Id", xmlValue) } class(res) <- c("esummary_list", "list") res } parse_node <- function(node) { node_type <- xmlGetAttr(node, "Type") node_fxn <- switch(node_type, "Integer" = parse_esumm_int, "List" = parse_esumm_list, "Structure" = parse_esumm_list, xmlValue) #unnamed arguments to switch = default val. return(node_fxn(node)) } parse_esumm_int <- function(node) as.integer(xmlValue(node)) parse_esumm_list <- function(node){ res <- lapply(node["Item"], parse_node) names(res) <- lapply(node["Item"], xmlGetAttr, "Name") return(res) } #' @export print.esummary <- function(x, ...){ len <- length(x) cat(paste("esummary result with", len - 1, "items:\n")) print(names(x)[-len], quote=FALSE) } #' @export print.esummary_list <- function(x, ...){ len <- length(x) cat("List of ", len, "esummary records. First record:\n\n ") print(x[1]) } rentrez/R/entrez_post.r0000755000176200001440000000337713237161662014717 0ustar liggesusers#' Post IDs to Eutils for later use #' #' #' #'@export #'@param db character Name of the database from which the IDs were taken #'@param id vector with unique ID(s) for records in database \code{db}. #'@param web_history A web_history object.
Can be used to add additional #' identifiers to an existing web environment on the NCBI #'@param \dots character Additional terms to add to the request, see NCBI #'documentation linked to in references for a complete list #'@param config vector of configuration options passed to httr::GET #'@references \url{http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_EPost_} #'@seealso \code{\link[httr]{config}} for available httr configurations #'@importFrom XML xmlTreeParse #' @examples #'\dontrun{ #' so_many_snails <- entrez_search(db="nuccore", #' "Gastropoda[Organism] AND COI[Gene]", retmax=200) #' upload <- entrez_post(db="nuccore", id=so_many_snails$ids) #' first <- entrez_fetch(db="nuccore", rettype="fasta", web_history=upload, #' retmax=10) #' second <- entrez_fetch(db="nuccore", rettype="fasta", web_history=upload, #' retstart=10, retmax=10) #'} entrez_post <- function(db, id=NULL, web_history=NULL, config=NULL, ...){ args <- list("epost", db=db, config=config, id=id, web_history=web_history, ...) if(!is.null(web_history)){ args <- c(args, WebEnv=web_history$WebEnv, query_key = web_history$QueryKey) args$web_history <- NULL } response <- do.call(make_entrez_query, args) record <- xmlTreeParse(response, useInternalNodes=TRUE) result <- xpathApply(record, "/ePostResult/*", XML::xmlValue) names(result) <- c("QueryKey", "WebEnv") class(result) <- c("web_history", "list") return(result) } rentrez/vignettes/0000755000176200001440000000000013240146763013761 5ustar liggesusersrentrez/vignettes/rentrez_tutorial.Rmd0000644000176200001440000006470013237161662020051 0ustar liggesusers--- title: Rentrez Tutorial author: "David Winter" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true vignette: > %\VignetteIndexEntry{Rentrez Tutorial} %\VignetteEngine{knitr::rmarkdown} %\usepackage[utf8]{inputenc} --- ```{r, count_recs, echo=FALSE} library(rentrez) count_recs <- function(db, denom) { nrecs <- rentrez::entrez_db_summary(db)["Count"] round(as.integer(nrecs)/denom, 1) } ``` ## Introduction: The NCBI, entrez and `rentrez`. The NCBI shares a _lot_ of data. At the time this document was compiled, there were `r count_recs("pubmed",1e6)` million papers in [PubMed](http://www.ncbi.nlm.nih.gov/pubmed/), including `r count_recs("pmc", 1e6)` million full-text records available in [PubMed Central](http://www.ncbi.nlm.nih.gov/pubmed/). [The NCBI Nucleotide Database](http://www.ncbi.nlm.nih.gov/nuccore) (which includes GenBank) has data for `r count_recs("nuccore", 1e6)` million different sequences, and [dbSNP](http://www.ncbi.nlm.nih.gov/snp/) describes `r count_recs("snp", 1e6)` million different genetic variants. All of these records can be cross-referenced with the `r round(entrez_search(db="taxonomy", term='species[RANK]')$count/1e6,2)` million species in the [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) or `r count_recs("omim", 1e3)` thousand disease-associated records in [OMIM](http://www.ncbi.nlm.nih.gov/omim). The NCBI makes this data available through a [web interface](http://www.ncbi.nlm.nih.gov/), an [FTP server](ftp://ftp.ncbi.nlm.nih.gov/) and through a REST API called the [Entrez Utilities](http://www.ncbi.nlm.nih.gov/books/NBK25500/) (`Eutils` for short). This package provides functions to use that API, allowing users to gather and combine data from multiple NCBI databases in the comfort of an R session or script.
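For a first taste of what that looks like, the short sketch below (not executed as part of this vignette) uses `entrez_global_query()` to count the records matching a single term in every Entrez database at once; the search term is only an illustration, and any organism name or keyword works the same way:

```r
# A sketch (not run): count records matching one term across all NCBI
# databases at once, then look at the databases with the most hits
library(rentrez)
hit_counts <- entrez_global_query(term="Tetrahymena thermophila")
head(sort(hit_counts, decreasing=TRUE))
```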
## Getting started with rentrez To make the most of all the data the NCBI shares you need to know a little about their databases, the records they contain and the ways you can find those records. The [NCBI provides extensive documentation for each of their databases](http://www.ncbi.nlm.nih.gov/home/documentation.shtml) and for the [EUtils API that `rentrez` takes advantage of](http://www.ncbi.nlm.nih.gov/books/NBK25501/). There are also some helper functions in `rentrez` that help users learn their way around the NCBI's databases. First, you can use `entrez_dbs()` to find the list of available databases: ```{r, dbs} entrez_dbs() ``` There is a set of functions with names starting `entrez_db_` that can be used to gather more information about each of these databases: **Functions that help you learn about NCBI databases** | Function name | Return | |--------------------------|------------------------------------------------------| | `entrez_db_summary()` | Brief description of what the database is | | `entrez_db_searchable()` | Set of search terms that can be used with this database | | `entrez_db_links()` | Set of databases that might contain linked records | For instance, we can get a description of the somewhat cryptically named database 'cdd'... ```{r, cdd} entrez_db_summary("cdd") ``` ... or find out which search terms can be used with the Sequence Read Archive (SRA) database (which contains raw data from sequencing projects): ```{r, sra_eg} entrez_db_searchable("sra") ``` Just how these 'helper' functions might be useful will become clearer once you've started using `rentrez`, so let's get started. ## Searching databases: `entrez_search()` Very often, the first thing you'll want to do with `rentrez` is search a given NCBI database to find records that match some keywords. You can do this using the function `entrez_search()`. In the simplest case you just need to provide a database name (`db`) and a search term (`term`) so let's search PubMed for articles about the `R language`: ```{r eg_search} r_search <- entrez_search(db="pubmed", term="R Language") ``` The object returned by a search acts like a list, and you can get a summary of its contents by printing it. ```{r print_search} r_search ``` There are a few things to note here. First, the NCBI's server has worked out that we meant R as a programming language, and so included the ['MeSH' term](http://www.ncbi.nlm.nih.gov/mesh) associated with programming languages. We'll worry about MeSH terms and other special queries later, for now just note that you can use this feature to check that your search term was interpreted in the way you intended. Second, there are many more 'hits' for this search than there are unique IDs contained in this object. That's because the optional argument `retmax`, which controls the maximum number of returned values, has a default value of 20. The IDs are the most important thing returned here. They allow us to fetch records matching those IDs, gather summary data about them or find cross-referenced records in other databases. We access the IDs as a vector using the `$` operator: ```{r search_ids} r_search$ids ``` If we want to get more than 20 IDs we can do so by increasing the `retmax` argument. ```{r searchids_2} another_r_search <- entrez_search(db="pubmed", term="R Language", retmax=40) another_r_search ``` If we want to get IDs for all of the thousands of records that match this search, we can use the NCBI's web history feature [described below](#web_history).
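If you only need a few hundred IDs, another option (sketched below, and not executed here) is to page through the results with the standard EUtils parameter `retstart`, which `entrez_search()` passes through to the API via its `...` argument:

```r
# A sketch (not run): page through a search 40 IDs at a time using
# retstart, an EUtils parameter passed through entrez_search()'s "..."
first_page  <- entrez_search(db="pubmed", term="R Language", retmax=40)
second_page <- entrez_search(db="pubmed", term="R Language", retmax=40, retstart=40)
all_ids     <- c(first_page$ids, second_page$ids)
```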
### Building search terms The EUtils API uses a special syntax to build search terms. You can search a database against a specific term using the format `query[SEARCH FIELD]`, and combine multiple such searches using the boolean operators `AND`, `OR` and `NOT`. For instance, we can find next generation sequence datasets for the (amazing...) ciliate _Tetrahymena thermophila_ by using the organism ('ORGN') search field: ```{r, Tt} entrez_search(db="sra", term="Tetrahymena thermophila[ORGN]", retmax=0) ``` We can narrow our focus to only those records that have been added recently (using the colon to specify a range of values): ```{r, Tt2} entrez_search(db="sra", term="Tetrahymena thermophila[ORGN] AND 2013:2015[PDAT]", retmax=0) ``` Or include recent records for either _T. thermophila_ or its close relative _T. borealis_ (using parentheses to make ANDs and ORs explicit). ```{r, Tt3} entrez_search(db="sra", term="(Tetrahymena thermophila[ORGN] OR Tetrahymena borealis[ORGN]) AND 2013:2015[PDAT]", retmax=0) ``` The set of search terms available varies between databases. You can get a list of available terms for any given database with `entrez_db_searchable()`: ```{r, sra_searchable} entrez_db_searchable("sra") ``` ### Using the Filter field "Filter" is a special field that, as the name suggests, allows you to limit records returned by a search to a set of filtering criteria. There is no programmatic way to find the particular terms that can be used with the Filter field. However, the NCBI's website provides an "advanced search" tool for some databases that can be used to discover these terms. For example, you can find all of the terms that can be used to filter searches of the nucleotide database using the [advanced search page for that database](https://www.ncbi.nlm.nih.gov/nuccore/advanced). On that page, selecting "Filter" from the first drop-down box then clicking "Show index list" will allow the user to scroll through possible filtering terms (for example, `refseq[FILTER]`, which limits results to RefSeq records). ### Precise queries using MeSH terms In addition to the search terms described above, the NCBI allows searches using [Medical Subject Heading (MeSH)](http://www.ncbi.nlm.nih.gov/mesh) terms. These terms create a 'controlled vocabulary', and allow users to make very finely controlled queries of databases. For instance, if you were interested in reviewing studies on how a class of anti-malarial drugs called Folic Acid Antagonists work against _Plasmodium vivax_ (a particular species of malarial parasite), you could use this search: ```{r, mesh} entrez_search(db = "pubmed", term = "(vivax malaria[MeSH]) AND (folic acid antagonists[MeSH])") ``` The complete set of MeSH terms is available as a database from the NCBI. That means it is possible to download detailed information about each term and find the ways in which terms relate to each other using `rentrez`. You can search for specific terms with `entrez_search(db="mesh", term =...)` and learn about the results of your search using the tools described below. ### Advanced counting As you can see above, the object returned by `entrez_search()` includes the number of records matching a given search. This means you can learn a little about the composition of, or trends in, the records stored in the NCBI's databases using only the search utility.
For instance, let's track the rise of the scientific buzzword "connectome" in PubMed, programmatically creating search terms for the `PDAT` field: ```{r, connectome, fig.width=5, fig.height=4, fig.align='center'} search_year <- function(year, term){ query <- paste(term, "AND (", year, "[PDAT])") entrez_search(db="pubmed", term=query, retmax=0)$count } year <- 2008:2014 papers <- sapply(year, search_year, term="Connectome", USE.NAMES=FALSE) plot(year, papers, type='b', main="The Rise of the Connectome") ``` ## Finding cross-references: `entrez_link()` One of the strengths of the NCBI databases is the degree to which records of one type are connected to other records within the NCBI or to external data sources. The function `entrez_link()` allows users to discover these links between records. ###My god, it's full of links To get an idea of the degree to which records in the NCBI are cross-linked we can find all NCBI data associated with a single gene (in this case the Amyloid Beta Precursor gene, the product of which is associated with the plaques that form in the brains of Alzheimer's Disease patients). The function `entrez_link()` can be used to find cross-referenced records. In the most basic case we need to provide an ID (`id`), the database from which this ID comes (`dbfrom`) and the name of a database in which to find linked records (`db`). If we set this last argument to 'all' we can find links in multiple databases: ```{r elink0} all_the_links <- entrez_link(dbfrom='gene', id=351, db='all') all_the_links ``` Just as with `entrez_search` the returned object behaves like a list, and we can learn a little about its contents by printing it. In this case, all of the information is in `links` (and there's a lot of them!): ```{r elink_link} all_the_links$links ``` The names of the list elements are in the format `[source_database]_[linked_database]` and the elements themselves contain a vector of linked-IDs. So, if we want to find open access publications associated with this gene we could get linked records in PubMed Central: ```{r, elink_pmc} all_the_links$links$gene_pmc[1:10] ``` Or if we were interested in this gene's role in diseases we could find links to clinVar: ```{r, elink_omim} all_the_links$links$gene_clinvar ``` ###Narrowing our focus If we know beforehand what sort of links we'd like to find, we can use the `db` argument to narrow the focus of a call to `entrez_link`. For instance, say we are interested in knowing about all of the RNA transcripts associated with the Amyloid Beta Precursor gene in humans. Transcript sequences are stored in the nucleotide database (referred to as `nuccore` in EUtils), so to find transcripts associated with a given gene we need to set `dbfrom=gene` and `db=nuccore`. ```{r, elink1} nuc_links <- entrez_link(dbfrom='gene', id=351, db='nuccore') nuc_links nuc_links$links ``` The object we get back contains links to the nucleotide database generally, but also to special subsets of that database like [refseq](http://www.ncbi.nlm.nih.gov/refseq/). We can take advantage of this narrower set of links to find IDs that match unique transcripts from our gene of interest. ```{r, elinik_refseqs} nuc_links$links$gene_nuccore_refseqrna ``` We can use these ids in calls to `entrez_fetch()` or `entrez_summary()` to learn more about the transcripts they represent. ###External links In addition to finding data within the NCBI, `entrez_link` can turn up connections to external databases.
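If you just want to know whether any such links exist for a record, a quick check is the `lcheck` command (a minimal sketch, not executed here; `lcheck` is one of the `cmd` values listed in `?entrez_link`):

```r
# A sketch (not run): cmd="lcheck" only checks for the existence of
# external (outside-NCBI) links for each ID, without retrieving them
has_linkouts <- entrez_link(dbfrom="pubmed", id=25500142, cmd="lcheck")
```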
Perhaps the most interesting example is finding links to the full text of papers in PubMed. For example, when I wrote this document the first paper linked to Amyloid Beta Precursor had a unique ID of `25500142`. We can find links to the full text of that paper with `entrez_link` by setting the `cmd` argument to 'llinks': ```{r, outlinks} paper_links <- entrez_link(dbfrom="pubmed", id=25500142, cmd="llinks") paper_links ``` Each element of the `linkouts` object contains information about an external source of data on this paper: ```{r, urls} paper_links$linkouts ``` Each of those linkout objects contains quite a lot of information, but the URL is probably the most useful. For that reason, `rentrez` provides the function `linkout_urls` to make extracting just the URL simple: ```{r just_urls} linkout_urls(paper_links) ``` The full list of options for the `cmd` argument is given in the in-line documentation (`?entrez_link`). If you are interested in finding full text records for a large number of articles, check out the package [fulltext](https://github.com/ropensci/fulltext), which makes use of multiple sources (including the NCBI) to discover full text articles. ###Using more than one ID It is possible to pass more than one ID to `entrez_link()`. By default, doing so will give you a single elink object containing the complete set of links for _all_ of the IDs that you specified. So, if you were looking for protein IDs related to specific genes you could do: ```{r, multi_default} all_links_together <- entrez_link(db="protein", dbfrom="gene", id=c("93100", "223646")) all_links_together all_links_together$links$gene_protein ``` Although this behaviour might sometimes be useful, it means we've lost track of which `protein` ID is linked to which `gene` ID. To retain that information we can set `by_id` to `TRUE`. This gives us a list of elink objects, each one containing links from a single `gene` ID: ```{r, multi_byid} all_links_sep <- entrez_link(db="protein", dbfrom="gene", id=c("93100", "223646"), by_id=TRUE) all_links_sep lapply(all_links_sep, function(x) x$links$gene_protein) ``` ## Getting summary data: `entrez_summary()` Having found the unique IDs for some records via `entrez_search` or `entrez_link()`, you are probably going to want to learn something about them. The `Eutils` API has two ways to get information about a record. `entrez_fetch()` returns 'full' records in varying formats and `entrez_summary()` returns less information about each record, but in a relatively simple format. Very often the summary records have the information you are after, so `rentrez` provides functions to parse and summarise summary records. ###The summary record `entrez_summary()` takes a vector of unique IDs for the records you want to get summary information from. Let's start by finding out something about the paper describing [Taxize](https://github.com/ropensci/taxize), using its PubMed ID: ```{r, Summ_1} taxize_summ <- entrez_summary(db="pubmed", id=24555091) taxize_summ ``` Once again, the object returned by `entrez_summary` behaves like a list, so you can extract elements using `$`. For instance, we could convert our PubMed ID to another article identifier... ```{r, Summ_2} taxize_summ$articleids ``` ...or see how many times the article has been cited in PubMed Central papers: ```{r, Summ_3} taxize_summ$pmcrefcount ``` ###Dealing with many records If you give `entrez_summary()` a vector with more than one ID you'll get a list of summary records back.
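One caution before we do: if your vector of IDs is generated programmatically and might sometimes contain a single ID, you can ask for a list unconditionally with the `always_return_list` argument (a sketch, not executed here; `some_ids` is a hypothetical stand-in for your own ID vector):

```r
# A sketch (not run): always_return_list=TRUE guarantees a list even when
# only one ID is supplied, so lapply()-style code downstream behaves the
# same for one record as for many. "some_ids" is a hypothetical ID vector.
summs <- entrez_summary(db="pubmed", id=some_ids, always_return_list=TRUE)
```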
Let's get those _Plasmodium vivax_ papers we found in the `entrez_search()` section back, and fetch some summary data on each paper: ```{r, multi_summ} vivax_search <- entrez_search(db = "pubmed", term = "(vivax malaria[MeSH]) AND (folic acid antagonists[MeSH])") multi_summs <- entrez_summary(db="pubmed", id=vivax_search$ids) ``` `rentrez` provides a helper function, `extract_from_esummary()`, which takes one or more elements from every summary record in one of these lists. Here it is working with one... ```{r, multi_summ2} extract_from_esummary(multi_summs, "fulljournalname") ``` ... and several elements: ```{r, multi_summ3} date_and_cite <- extract_from_esummary(multi_summs, c("pubdate", "pmcrefcount", "title")) knitr::kable(head(t(date_and_cite)), row.names=FALSE) ``` ##Fetching full records: `entrez_fetch()` As useful as the summary records are, sometimes they just don't have the information that you need. If you want a complete representation of a record you can use `entrez_fetch`, using the argument `rettype` to specify the format you'd like the record in. ###Fetch DNA sequences in fasta format Let's extend the example given in the `entrez_link()` section about finding transcripts for a given gene. This time we will fetch cDNA sequences of those transcripts. We can start by repeating the steps in the earlier example to get nucleotide IDs for refseq transcripts of two genes: ```{r, transcript_ids} gene_ids <- c(351, 11647) linked_seq_ids <- entrez_link(dbfrom="gene", id=gene_ids, db="nuccore") linked_transripts <- linked_seq_ids$links$gene_nuccore_refseqrna head(linked_transripts) ``` Now we can get our sequences with `entrez_fetch`, setting `rettype` to "fasta" (the list of formats available for [each database is given in this table](http://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/)): ```{r fetch_fasta} all_recs <- entrez_fetch(db="nuccore", id=linked_transripts, rettype="fasta") class(all_recs) nchar(all_recs) ``` Congratulations, now you have a really huge character vector! Rather than printing all those thousands of bases we can take a peek at the top of the file: ```{r, peak} cat(strwrap(substr(all_recs, 1, 500)), sep="\n") ``` If we wanted to use these sequences in some other application we could write them to file: ```r write(all_recs, file="my_transcripts.fasta") ``` Alternatively, if you want to use them within an R session we could write them to a temporary file then read that. In this case I'm using `read.dna()` from the phylogenetics package ape (but not executing the code block in this vignette, so you don't have to install that package): ```r temp <- tempfile() write(all_recs, temp) parsed_recs <- ape::read.dna(temp, format="fasta") ``` ###Fetch a parsed XML document Most of the NCBI's databases can return records in XML format. In addition to downloading the text-representation of these files, `entrez_fetch()` can return objects parsed by the `XML` package. As an example, we can check out the Taxonomy database's record for (did I mention they are amazing....) _Tetrahymena thermophila_, specifying we want the result to be parsed by setting `parsed=TRUE`: ```{r, Tt_tax} Tt <- entrez_search(db="taxonomy", term="(Tetrahymena thermophila[ORGN]) AND Species[RANK]") tax_rec <- entrez_fetch(db="taxonomy", id=Tt$ids, rettype="xml", parsed=TRUE) class(tax_rec) ``` The package XML (which you have if you have installed `rentrez`) provides functions to get information from these files.
For relatively simple records like this one you can use `XML::xmlToList`: ```{r, Tt_list} tax_list <- XML::xmlToList(tax_rec) tax_list$Taxon$GeneticCode ``` For more complex records, which generate deeply-nested lists, you can use [XPath expressions](https://en.wikipedia.org/wiki/XPath) along with the function `XML::xpathSApply` or the extraction operators `[` and `[[` to extract specific parts of the file. For instance, we can get the scientific name of each taxon in _T. thermophila_'s lineage by specifying a path through the XML: ```{r, Tt_path} tt_lineage <- tax_rec["//LineageEx/Taxon/ScientificName"] tt_lineage[1:4] ``` As the name suggests, `XML::xpathSApply()` is a counterpart of base R's `sapply`, and can be used to apply a function to nodes in an XML object. A particularly useful function to apply is `XML::xmlValue`, which returns the content of the node: ```{r, Tt_apply} XML::xpathSApply(tax_rec, "//LineageEx/Taxon/ScientificName", XML::xmlValue) ``` There are a few more complex examples of using `XPath` [on the rentrez wiki](https://github.com/ropensci/rentrez/wiki). ##Using NCBI's Web History features When you are dealing with very large queries it can be time consuming to pass long vectors of unique IDs to and from the NCBI. To avoid this problem, the NCBI provides a feature called "web history" which allows users to store IDs on the NCBI servers then refer to them in future calls. ###Post a set of IDs to the NCBI for later use: `entrez_post()` If you have a list of many NCBI IDs that you want to use later on, you can post them to the NCBI's servers. In order to provide a brief example, I'm going to post just one ID, the `omim` identifier for asthma: ```{r, asthma} upload <- entrez_post(db="omim", id=600807) upload ``` The NCBI sends you back some information you can use to refer to the posted IDs. In `rentrez`, that information is represented as a `web_history` object. Note that if you have a very long list of IDs you may receive a 414 error when you try to upload them. If you have such a list (and they come from an external source rather than a search that can be saved to a `web_history` object), you may have to 'chunk' the IDs into smaller sets that can be processed. ###Get a `web_history` object from `entrez_search` or `entrez_link()` In addition to directly uploading IDs to the NCBI, you can use the web history features with `entrez_search` and `entrez_link`. For instance, imagine you wanted to find all of the sequences of the widely-studied gene COI from all snails (which are members of the taxonomic group Gastropoda): ```{r, snail_search} entrez_search(db="nuccore", term="COI[Gene] AND Gastropoda[ORGN]") ``` That's a lot of sequences! If you really wanted to download all of these it would be a good idea to save all those IDs to the server by setting `use_history` to `TRUE` (note you now get a `web_history` object along with your normal search result): ```{r, snail_history} snail_coi <- entrez_search(db="nuccore", term="COI[Gene] AND Gastropoda[ORGN]", use_history=TRUE) snail_coi snail_coi$web_history ``` Similarly, `entrez_link()` can return `web_history` objects by using the `cmd` `neighbor_history`.
Let's find genetic variants (from the clinvar database) associated with asthma (using the same OMIM ID we identified earlier): ```{r, asthma_links} asthma_clinvar <- entrez_link(dbfrom="omim", db="clinvar", cmd="neighbor_history", id=600807) asthma_clinvar$web_histories ``` As you can see, instead of returning lists of IDs for each linked database (as it would by default), `entrez_link()` now returns a list of web_histories. ###Use a `web_history` object Once you have those IDs stored on the NCBI's servers, you are going to want to do something with them. The functions `entrez_fetch()`, `entrez_summary()` and `entrez_link()` can all use `web_history` objects in exactly the same way they use IDs. So, we could repeat the last example (finding variants linked to asthma), but this time using the ID we uploaded earlier: ```{r, asthma_links_upload} asthma_variants <- entrez_link(dbfrom="omim", db="clinvar", cmd="neighbor_history", web_history=upload) asthma_variants ``` ... if we want to get some genetic information about these variants we need to map our clinvar IDs to SNP IDs: ```{r, links} snp_links <- entrez_link(dbfrom="clinvar", db="snp", web_history=asthma_variants$web_histories$omim_clinvar, cmd="neighbor_history") snp_summ <- entrez_summary(db="snp", web_history=snp_links$web_histories$clinvar_snp) knitr::kable(extract_from_esummary(snp_summ, c("chr", "fxn_class", "global_maf"))) ``` If you really wanted to you could also use `web_history` objects to download all those thousands of COI sequences. When downloading large sets of data, it is a good idea to take advantage of the arguments `retmax` and `retstart` to split the request up into smaller chunks. For instance, we could get the first 200 sequences in 50-sequence chunks: (note: this code block is not executed as part of the vignette to save time and bandwidth): ```r for( seq_start in seq(1,200,50)){ recs <- entrez_fetch(db="nuccore", web_history=snail_coi$web_history, rettype="fasta", retmax=50, retstart=seq_start) cat(recs, file="snail_coi.fasta", append=TRUE) cat(seq_start+49, "sequences downloaded\r") } ``` ## Using API Keys By default, the NCBI limits users to making only 3 requests per second (and `rentrez` enforces that limit). Users who register for an "API key" are able to make up to ten requests per second. Getting one of these keys is simple, you just need to [register for a "my ncbi" account](https://www.ncbi.nlm.nih.gov/account/) then click on a button in the [account settings page](https://www.ncbi.nlm.nih.gov/account/settings/). Once you have an API key, rentrez will allow you to take advantage of it. For one-off cases, this is as simple as adding the `api_key` argument to a given function call. (Note these examples are not executed as part of this document, as the API key used here is not a real one). ```r entrez_link(db="protein", dbfrom="gene", id=93100, api_key ="ABCD123") ``` In most cases you will want to use your API key for each of several calls to the NCBI. `rentrez` makes this easy by allowing you to set an environment variable, `ENTREZ_KEY`. Once this value is set to your key, `rentrez` will use it for all requests to the NCBI. To set the value for a single R session you can use the function `set_entrez_key()`. Here we set the value and confirm it is available. ```{r, set_key} set_entrez_key("ABCD123") Sys.getenv("ENTREZ_KEY") ``` If you use `rentrez` often you should edit your `.Renviron` file (see `r help(Startup)` for description of this file) to include your key.
Doing so will mean all requests you send will take advantage of your API key. ```ini ENTREZ_KEY=ABCD123 ``` As long as an API key is set by one of these methods, `rentrez` will allow you to make up to ten requests per second. ## What next ? This tutorial has introduced you to the core functions of `rentrez`, there are almost limitless ways that you could put them together. [Check out the wiki](https://github.com/ropensci/rentrez/wiki) for more specific examples, and be sure to read the inline-documentation for each function. If you run into problem with rentrez, or just need help with the package and `Eutils` please contact us by opening an issue at the [github repository](https://github.com/ropensci/rentrez/issues) rentrez/MD50000644000176200001440000000600113240156221012244 0ustar liggesusers53f95884b3c88adf2e212fae1d497d3a *DESCRIPTION ce28e10e378a31d3a383fab9edec5b0c *LICENSE 604e68831f408f3fe00d376c2573ad46 *NAMESPACE a159ff890d30e3d2d46f113e95af85f2 *NEWS a917b8bcbc84e6283def96e3373f5b66 *R/api_keys.r a3d73ccb9830f7ee143ddbe452982887 *R/base.r 111faade12c3113b9e790591a06e4d6e *R/entrez_citmatch.r 699061f94eec757936b8b28f2dca9429 *R/entrez_fetch.r 67e01c03decfcbaa840dad8e062a566a *R/entrez_global_query.r 033301b7c9d56c3a42748f7831967acd *R/entrez_info.r 2d6c5115a7659ca7f046fc996cf4bdc8 *R/entrez_link.r 05bdb8737863eb909a3b160925f57fff *R/entrez_post.r 8fdb21ace7a50413c1c5815bd9bb9f2f *R/entrez_search.r 3a42c38db7146642a4ceddbbc794c9f4 *R/entrez_summary.r 8bc803d43b3e90e932d6c82894f59650 *R/help.r 646f614d14b267f07f6289e0cc54f357 *R/parse_pubmed_xml.r 1eea29e0728d29fc4319d61ad0b9d96d *build/vignette.rds 80141bd00c004447a7e9c87eb2d0cdab *inst/CITATION 9feb6e1060bb1a02aa19fedfb364664c *inst/doc/rentrez_tutorial.R 9ba8cf55cc7bbc9bdcb94bc7e46107c1 *inst/doc/rentrez_tutorial.Rmd 3e6a164683a575c16b5ead301978b2a9 *inst/doc/rentrez_tutorial.html 4c2c03b2da5998fc705ba3fecb9b11ec *man/entrez_citmatch.Rd c5d2dfb20a19258305f66992b9a6eb93 *man/entrez_db_links.Rd 1bbec18591b69b1f24e21009536f50ba *man/entrez_db_searchable.Rd b76c339678acd47b16fc4941fcb48998 *man/entrez_db_summary.Rd 58c939254eded7cc617963dac52d78cc *man/entrez_dbs.Rd 753796a12579f37f03a5fdd689282a98 *man/entrez_fetch.Rd 42e2cf420693e3e3970bc952c49e0ee1 *man/entrez_global_query.Rd 241c6bacdf3fad63d468e6ffc683c7b2 *man/entrez_info.Rd 056c172b38cfbd925832129cbb5827a7 *man/entrez_link.Rd c3bb44bdde21b463be5f153e7c862609 *man/entrez_post.Rd cb6cac8536ee90bcfd7e8f1b1fe7b4b0 *man/entrez_search.Rd f9aeb339517b0e8932ec0447962f2605 *man/entrez_summary.Rd 12aac656a27d8539dbb75734891339f9 *man/extract_from_esummary.Rd d563be6b5b0c28e82f1f9837b7f33d2c *man/linkout_urls.Rd f63b2726b9639edd7faa24e57fc828a3 *man/parse_pubmed_xml.Rd b4a0e94018cb01c5c00da6bb5ed1a835 *man/rentrez.Rd 08509834a23432d8bb70d64f61724d16 *man/set_entrez_key.Rd db04e7147a14d952e0ae8c93d1390087 *tests/test-all.R 42a661e59c02374e0dff643090d7d110 *tests/testthat/test_api_key.r 0c4b51d40ae63cbfdcfac31cd67edb96 *tests/testthat/test_citmatch.r 4edd85844f931fee501b87861537459c *tests/testthat/test_docs.r 9391f49d755372af5e9faef10553169c *tests/testthat/test_fetch.r a4c45c8f355eafbc660aede214e6f526 *tests/testthat/test_httr.r 90a64274ba59f7232cf0adc3cbe8e86e *tests/testthat/test_httr_post.r 6f1a4c681ca3b43318b45fdf4a87221f *tests/testthat/test_info.r f7f1fe31b6a902289daae7fe6e1b3554 *tests/testthat/test_link.r 1ac649cfb5ba8744d2d62ef182c6bad9 *tests/testthat/test_net.r d9c769bcd94e0464e232d79ca6a063db *tests/testthat/test_parse.r a2bfe354a3cecc892df44f53fd6c9f58 
*tests/testthat/test_post.r ae5359bee5086c5748a01c6a317a8b0a *tests/testthat/test_query.r d8b42d6257b50ed6dba47f0d00c28875 *tests/testthat/test_search.r 8fdddfa6eb899bfaf3a6c95414284c89 *tests/testthat/test_summary.r c435f7927e6e2f015bf43780f4957e1e *tests/testthat/test_webenv.r 9ba8cf55cc7bbc9bdcb94bc7e46107c1 *vignettes/rentrez_tutorial.Rmd rentrez/build/0000755000176200001440000000000013240146763013050 5ustar liggesusersrentrez/build/vignette.rds0000644000176200001440000000032213240146763015404 0ustar liggesusersb```b`ffb`b2 1# ')J+)J/)-/L MAS#QU&/aFFI."!L&$yhư楀aM wjey~L6̜T!%ps QY_/(. @hrNb1GRKҊA'rentrez/DESCRIPTION0000644000176200001440000000243313240156221013447 0ustar liggesusersPackage: rentrez Version: 1.2.0 Date: 2018-02-12 Title: Entrez in R Authors@R: c( person("David", "Winter", role=c("aut", "cre"), email = "david.winter@gmail.com", comment = c("ORCID = 0000-0002-6165-0029")), person("Scott", "Chamberlain", role="ctb", email = "myrmecocystus@gmail.com", comment = c("ORCID = 0000-0003-1444-9135")), person("Han","Guangchun", role=c("ctb"),email="hanguangchun@gmail.com", comment = c("ORCID = 0000-0001-9277-2507")) ) Depends: R (>= 2.6.0) Imports: XML, httr (>= 0.5), jsonlite (>= 0.9) Suggests: testthat, knitr, rmarkdown URL: http://github.com/ropensci/rentrez BugReports: https://github.com/ropensci/rentrez/issues Description: Provides an R interface to the NCBI's EUtils API allowing users to search databases like GenBank and PubMed, process the results of those searches and pull data into their R sessions. VignetteBuilder: knitr License: MIT + file LICENSE RoxygenNote: 6.0.1 NeedsCompilation: no Packaged: 2018-02-11 23:12:51 UTC; david Author: David Winter [aut, cre] (ORCID = 0000-0002-6165-0029), Scott Chamberlain [ctb] (ORCID = 0000-0003-1444-9135), Han Guangchun [ctb] (ORCID = 0000-0001-9277-2507) Maintainer: David Winter Repository: CRAN Date/Publication: 2018-02-12 00:15:13 UTC rentrez/man/0000755000176200001440000000000013237161662012525 5ustar liggesusersrentrez/man/entrez_dbs.Rd0000644000176200001440000000114213111437373015145 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_info.r \name{entrez_dbs} \alias{entrez_dbs} \title{List databases available from the NCBI} \usage{ entrez_dbs(config = NULL) } \arguments{ \item{config}{config vector passed to \code{httr::GET}} } \value{ character vector listing available dbs } \description{ Retrieves the names of databases available through the EUtils API } \examples{ \donttest{ entrez_dbs() } } \seealso{ Other einfo: \code{\link{entrez_db_links}}, \code{\link{entrez_db_searchable}}, \code{\link{entrez_db_summary}}, \code{\link{entrez_info}} } rentrez/man/entrez_db_searchable.Rd0000644000176200001440000000231013111437373017131 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_info.r \name{entrez_db_searchable} \alias{entrez_db_searchable} \title{List available search fields for a given database} \usage{ entrez_db_searchable(db, config = NULL) } \arguments{ \item{db}{character, name of database to get search field from} \item{config}{config vector passed to \code{httr::GET}} } \value{ An eInfoSearch object (subclassed from list) summarizing linked-databases. Can be coerced to a data-frame with \code{as.data.frame}. Printing the object shows only the names of each available search field. } \description{ Fetch a list of search fields that can be used with a given database. 
Fields can be used as part of the \code{term} argument to \code{\link{entrez_search}} } \examples{ \donttest{ pmc_fields <- entrez_db_searchable("pmc") pmc_fields[["AFFL"]] entrez_search(db="pmc", term="Otago[AFFL]", retmax=0) entrez_search(db="pmc", term="Auckland[AFFL]", retmax=0) sra_fields <- entrez_db_searchable("sra") as.data.frame(sra_fields) } } \seealso{ \code{\link{entrez_search}} Other einfo: \code{\link{entrez_db_links}}, \code{\link{entrez_db_summary}}, \code{\link{entrez_dbs}}, \code{\link{entrez_info}} } rentrez/man/entrez_db_links.Rd0000644000176200001440000000247613111437373016175 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_info.r \name{entrez_db_links} \alias{entrez_db_links} \title{List available links for records from a given NCBI database} \usage{ entrez_db_links(db, config = NULL) } \arguments{ \item{db}{character, name of database to search} \item{config}{config vector passed to \code{httr::GET}} } \value{ An eInfoLink object (sub-classed from list) summarizing linked-databases. Can be coerced to a data-frame with \code{as.data.frame}. Printing the object shows the name of each element (which is the correct name for \code{entrez_link}), and can be used to get (a little) more information about each linked database (see example below). } \description{ For a given database, fetch a list of other databases that contain cross-referenced records. The names of these records can be used as the \code{db} argument in \code{\link{entrez_link}} } \examples{ \donttest{ taxid <- entrez_search(db="taxonomy", term="Osmeriformes")$ids tax_links <- entrez_db_links("taxonomy") tax_links entrez_link(dbfrom="taxonomy", db="pmc", id=taxid) sra_links <- entrez_db_links("sra") as.data.frame(sra_links) } } \seealso{ \code{\link{entrez_link}} Other einfo: \code{\link{entrez_db_searchable}}, \code{\link{entrez_db_summary}}, \code{\link{entrez_dbs}}, \code{\link{entrez_info}} } rentrez/man/entrez_link.Rd0000644000176200001440000000610313240132733015331 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_link.r \name{entrez_link} \alias{entrez_link} \title{Get links to datasets related to records from an NCBI database} \usage{ entrez_link(dbfrom, web_history = NULL, id = NULL, db = NULL, cmd = "neighbor", by_id = FALSE, config = NULL, ...) } \arguments{ \item{dbfrom}{character Name of database from which the Id(s) originate} \item{web_history}{a web_history object} \item{id}{vector with unique ID(s) for records in database \code{db}.} \item{db}{character Name of the database to search for links (or use "all" to search all databases available for \code{db}). \code{entrez_db_links} allows you to discover databases that might have linked information (see examples).} \item{cmd}{link function to use. Allowed values include \itemize{ \item neighbor (default). Returns a set of IDs in \code{db} linked to the input IDs in \code{dbfrom}. \item neighbor_score. As `neighbor', but additionally returns similarity scores. \item neighbor_history. As `neighbor', but returns web history objects. \item acheck. Returns a list of linked databases available from NCBI for a set of IDs. \item ncheck. Checks for the existence of links within a single database. \item lcheck. Checks for external (i.e. outside NCBI) links. \item llinks. Returns a list of external links for each ID, excluding links provided by libraries. \item llinkslib. As 'llinks' but additionally includes links provided by libraries. \item prlinks.
As 'llinks' but returns only the primary external link for each ID. }} \item{by_id}{logical If FALSE (default) return a single \code{elink} object containing links for all of the provided \code{id}s. Alternatively, if TRUE return a list of \code{elink} objects, one for each ID in \code{id}.} \item{config}{vector configuration options passed to httr::GET} \item{\dots}{character Additional terms to add to the request, see NCBI documentation linked to in references for a complete list} } \value{ An elink object containing the data defined by the \code{cmd} argument (if by_id=FALSE) or a list of such objects (if by_id=TRUE). file XMLInternalDocument xml file resulting from search, parsed with \code{\link{xmlTreeParse}} } \description{ Discover records related to a set of unique identifiers from an NCBI database. The object returned by this function depends on the value set for the \code{cmd} argument. Printing the returned object lists the names, and provides a brief description, of the elements included in the object. } \examples{ \donttest{ pubmed_search <- entrez_search(db = "pubmed", term ="10.1016/j.ympev.2010.07.013[doi]") linked_dbs <- entrez_db_links("pubmed") linked_dbs nucleotide_data <- entrez_link(dbfrom = "pubmed", id = pubmed_search$ids, db ="nuccore") #Sources for the full text of the paper res <- entrez_link(dbfrom="pubmed", db="", cmd="llinks", id=pubmed_search$ids) linkout_urls(res) } } \references{ \url{http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ELink_} } \seealso{ \code{\link[httr]{config}} for available configs \code{entrez_db_links} } rentrez/man/entrez_post.Rd0000644000176200001440000000254213111437373015367 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_post.r \name{entrez_post} \alias{entrez_post} \title{Post IDs to Eutils for later use} \usage{ entrez_post(db, id = NULL, web_history = NULL, config = NULL, ...) } \arguments{ \item{db}{character Name of the database from which the IDs were taken} \item{id}{vector with unique ID(s) for records in database \code{db}.} \item{web_history}{A web_history object. Can be used to add additional identifiers to an existing web environment on the NCBI} \item{config}{vector of configuration options passed to httr::GET} \item{\dots}{character Additional terms to add to the request, see NCBI documentation linked to in references for a complete list} } \description{ Post IDs to Eutils for later use } \examples{ \dontrun{ so_many_snails <- entrez_search(db="nuccore", "Gastropoda[Organism] AND COI[Gene]", retmax=200) upload <- entrez_post(db="nuccore", id=so_many_snails$ids) first <- entrez_fetch(db="nuccore", rettype="fasta", web_history=upload, retmax=10) second <- entrez_fetch(db="nuccore", rettype="fasta", web_history=upload, retstart=10, retmax=10) } } \references{ \url{http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_EPost_} } \seealso{ \code{\link[httr]{config}} for available httr configurations } rentrez/man/entrez_summary.Rd0000644000176200001440000000713613111437373016103 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_summary.r \name{entrez_summary} \alias{entrez_summary} \title{Get summaries of objects in NCBI datasets from a unique ID} \usage{ entrez_summary(db, id = NULL, web_history = NULL, version = c("2.0", "1.0"), always_return_list = FALSE, retmode = NULL, config = NULL, ...)
} \arguments{ \item{db}{character Name of the database to search for} \item{id}{vector with unique ID(s) for records in database \code{db}. In the case of sequence databases these IDs can take the form of an NCBI accession followed by a version number (eg AF123456.1 or AF123456.2)} \item{web_history}{A web_history object} \item{version}{either 1.0 or 2.0, see above for description} \item{always_return_list}{logical, return a list of esummary objects even when only one ID is provided (see description for a note about this option)} \item{retmode}{either "xml" or "json". By default, xml will be used for version 1.0 records, json for version 2.0.} \item{config}{vector configuration options passed to \code{httr::GET}} \item{\dots}{character Additional terms to add to the request, see NCBI documentation linked to in references for a complete list} } \value{ A list of esummary records (if multiple IDs are passed and always_return_list is FALSE) or a single record. file XMLInternalDocument xml file containing the entire record returned by the NCBI. } \description{ The NCBI offers two distinct formats for summary documents. Version 1.0 is a relatively limited summary of a database record based on a shared Document Type Definition. Version 1.0 summaries are only available as XML and are not available for some newer databases. Version 2.0 summaries generally contain more information about a given record, but each database has its own distinct format. 2.0 summaries are available for records in all databases and as JSON and XML files. As of version 0.4, rentrez fetches version 2.0 summaries by default and uses JSON as the exchange format (as JSON objects can be more easily converted into native R types). Existing scripts which relied on the structure and naming of the "Version 1.0" summary files can be updated by setting the new \code{version} argument to "1.0". } \details{ By default, entrez_summary returns a single record when only one ID is passed and a list of such records when multiple IDs are passed. This can lead to unexpected behaviour when the results of a variable number of IDs (perhaps the result of \code{entrez_search}) are processed with an apply family function or in a for-loop. If you use this function as part of a function or script that generates a variably-sized vector of IDs setting \code{always_return_list} to \code{TRUE} will avoid these problems. The function \code{extract_from_esummary} is provided for the specific case of extracting named elements from a list of esummary objects, and is designed to work on single objects as well as lists.
} \examples{ \donttest{ pop_ids = c("307082412", "307075396", "307075338", "307075274") pop_summ <- entrez_summary(db="popset", id=pop_ids) extract_from_esummary(pop_summ, "title") # clinvar example res <- entrez_search(db = "clinvar", term = "BRCA1", retmax=10) cv <- entrez_summary(db="clinvar", id=res$ids) cv extract_from_esummary(cv, "title", simplify=FALSE) extract_from_esummary(cv, "trait_set")[1:2] extract_from_esummary(cv, "gene_sort") } } \references{ \url{http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESummary_} } \seealso{ \code{\link[httr]{config}} for available configs \code{\link{extract_from_esummary}} which can be used to extract elements from a list of esummary records } rentrez/man/set_entrez_key.Rd0000644000176200001440000000143313237161662016047 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/api_keys.r \name{set_entrez_key} \alias{set_entrez_key} \title{Set the ENTREZ_KEY variable to be used by all rentrez functions} \usage{ set_entrez_key(key) } \arguments{ \item{key}{character. Value to set ENTREZ_KEY to (i.e. your API key).} } \value{ A logical of length one: TRUE if the value was set, FALSE if not. The value is returned inside invisible(), i.e. it is not printed to screen when the function is called. } \description{ The NCBI allows users to make more requests (10 per second) if they register for and use an API key. This function allows users to set this key for all calls to rentrez functions during a particular R session. See the vignette section "Using API keys" for a detailed description. } rentrez/man/entrez_db_summary.Rd0000644000176200001440000000166213111437373016546 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_info.r \name{entrez_db_summary} \alias{entrez_db_summary} \title{Retrieve summary information about an NCBI database} \usage{ entrez_db_summary(db, config = NULL) } \arguments{ \item{db}{character, name of database to summaries} \item{config}{config vector passed to \code{httr::GET}} } \value{ Character vector with the following data DbName Name of database Description Brief description of the database Count Number of records contained in the database MenuName Name in web-interface to EUtils DbBuild Unique ID for current build of database LastUpdate Date of most recent update to database } \description{ Retrieve summary information about an NCBI database } \examples{ entrez_db_summary("pubmed") } \seealso{ Other einfo: \code{\link{entrez_db_links}}, \code{\link{entrez_db_searchable}}, \code{\link{entrez_dbs}}, \code{\link{entrez_info}} } rentrez/man/parse_pubmed_xml.Rd0000644000176200001440000000162613111437373016343 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/parse_pubmed_xml.r \name{parse_pubmed_xml} \alias{parse_pubmed_xml} \title{Summarize an XML record from pubmed.} \usage{ parse_pubmed_xml(record) } \arguments{ \item{record}{Either and XMLInternalDocument or character the record to be parsed ( expected to come from \code{\link{entrez_fetch}})} } \value{ Either a single pubmed_record object, or a list of several } \description{ Note: this function assumes all records are of the type "PubmedArticle" and will return an empty record for any other type (including books).
} \examples{ hox_paper <- entrez_search(db="pubmed", term="10.1038/nature08789[doi]") hox_rel <- entrez_link(db="pubmed", dbfrom="pubmed", id=hox_paper$ids) recs <- entrez_fetch(db="pubmed", id=hox_rel$links$pubmed_pubmed[1:3], rettype="xml") parse_pubmed_xml(recs) } rentrez/man/entrez_info.Rd0000644000176200001440000000270213111437373015333 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_info.r \name{entrez_info} \alias{entrez_info} \title{Get information about EUtils databases} \usage{ entrez_info(db = NULL, config = NULL) } \arguments{ \item{db}{character database about which to retrieve information (optional)} \item{config}{config vector passed on to \code{httr::GET}} } \value{ XMLInternalDocument with information describing either all the databases available in Eutils (if db is not set) or one particular database (set by 'db') } \description{ Gather information about EUtils generally, or a given Eutils database. Note: The most common use-cases for the einfo util are finding the list of search fields available for a given database or the other NCBI databases to which records in a given database might be linked. Both these use cases are implemented in higher-level functions that return just this information (\code{entrez_db_searchable} and \code{entrez_db_links} respectively). Consequently most users will not have a reason to use this function (though it is exported by \code{rentrez} for the sake of completeness). } \examples{ \dontrun{ all_the_data <- entrez_info() XML::xpathSApply(all_the_data, "//DbName", xmlValue) entrez_dbs() } } \seealso{ \code{\link[httr]{config}} for available httr configurations Other einfo: \code{\link{entrez_db_links}}, \code{\link{entrez_db_searchable}}, \code{\link{entrez_db_summary}}, \code{\link{entrez_dbs}} } rentrez/man/linkout_urls.Rd0000644000176200001440000000067713111437373015554 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_link.r \name{linkout_urls} \alias{linkout_urls} \title{Extract URLs from an elink object} \usage{ linkout_urls(elink) } \arguments{ \item{elink}{elink object (returned by entrez_link) containing Urls} } \value{ list of character vectors, one per ID, each containing the URLs for that ID. } \description{ Extract URLs from an elink object } \seealso{ entrez_link } rentrez/man/entrez_fetch.Rd0000644000176400001440000000514413111437373015474 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_fetch.r \name{entrez_fetch} \alias{entrez_fetch} \title{Download data from NCBI databases} \usage{ entrez_fetch(db, id = NULL, web_history = NULL, rettype, retmode = "", parsed = FALSE, config = NULL, ...) } \arguments{ \item{db}{character, name of the database to use} \item{id}{vector (numeric or character), unique ID(s) for records in database \code{db}. In the case of sequence databases these IDs can take the form of an NCBI accession followed by a version number (eg AF123456.1 or AF123456.2).} \item{web_history}{a web_history object} \item{rettype}{character, format in which to get data (eg, fasta, xml...)} \item{retmode}{character, mode in which to receive data, defaults to 'text'} \item{parsed}{boolean should entrez_fetch attempt to parse the resulting file.
Only works with XML records (including those with a rettype other than "xml") at present.} \item{config}{vector, httr configuration options passed to httr::GET} \item{\dots}{character, additional terms to add to the request, see NCBI documentation linked to in references for a complete list} } \value{ character string containing the file created
XMLInternalDocument a parsed XML document if parsed=TRUE and rettype is a flavour of XML } \description{ A set of unique identifiers must be specified with either the \code{id} argument (which directly specifies the IDs as a numeric or character vector) or a \code{web_history} object as returned by \code{\link{entrez_link}}, \code{\link{entrez_search}} or \code{\link{entrez_post}}. See \href{https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/}{Table 1} in the linked reference for the set of formats available for each database. In particular, note that sequence databases (nuccore, protein and their relatives) use specific format names (eg "native", "ipg") for different flavours of xml. } \details{ For the most part, this function returns a character vector containing the fetched records. For XML records (including 'native', 'ipg', 'gbc' sequence records), setting \code{parsed} to \code{TRUE} will return an \code{XMLInternalDocument}. } \examples{ \dontrun{ katipo <- "Latrodectus katipo[Organism]" katipo_search <- entrez_search(db="nuccore", term=katipo) katipo_seqs <- entrez_fetch(db="nuccore", id=katipo_search$ids, rettype="fasta") #xml katipo_seqs <- entrez_fetch(db="nuccore", id=katipo_search$ids, rettype="native") } } \references{ \url{http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_EFetch_} } \seealso{ \code{\link[httr]{config}} for available \code{httr} configs }
rentrez/man/rentrez.Rd0000644000176200001440000000176013111437373014505 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/help.r \docType{package} \name{rentrez} \alias{rentrez} \alias{rentrez-package} \title{rentrez} \description{ rentrez provides functions to search for, discover and download data from the NCBI's databases using their EUtils API. } \details{ Users are expected to know a little bit about the EUtils API, which is well documented: \url{http://www.ncbi.nlm.nih.gov/books/NBK25500/} The NCBI will ban IPs that don't use EUtils within their \href{http://www.ncbi.nlm.nih.gov/corehtml/query/static/eutils_help.html}{user guidelines}. In particular \enumerate{ \item Don't send more than three requests per second (rentrez enforces this limit) \item If you plan on sending a sequence of more than ~100 requests, do so outside of peak times for the US \item For large requests use the web history method (see examples for \code{\link{entrez_search}} or use \code{\link{entrez_post}} to upload IDs) } }
rentrez/man/entrez_global_query.Rd0000644000176200001440000000141513111437373017065 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_global_query.r \name{entrez_global_query} \alias{entrez_global_query} \title{Find the number of records that match a given term across all NCBI Entrez databases} \usage{ entrez_global_query(term, config = NULL, ...)
} \arguments{ \item{term}{the search term to use} \item{config}{vector configuration options passed to httr::GET} \item{...}{additional arguments to add to the query} } \value{ a named vector with counts for each database } \description{ Find the number of records that match a given term across all NCBI Entrez databases } \examples{ NCBI_data_on_best_butterflies_ever <- entrez_global_query(term="Heliconius") } \seealso{ \code{\link[httr]{config}} for available configs }
rentrez/man/entrez_search.Rd0000644000176200001440000000704313240132733015643 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_search.r \name{entrez_search} \alias{entrez_search} \title{Search the NCBI databases using EUtils} \usage{ entrez_search(db, term, config = NULL, retmode = "xml", use_history = FALSE, ...) } \arguments{ \item{db}{character, name of the database to search.} \item{term}{character, the search term. The syntax used in making these searches is described in the Details of this help message, the package vignette and the reference given below.} \item{config}{vector configuration options passed to httr::GET} \item{retmode}{character, one of xml (default) or json. This will make no difference in most cases.} \item{use_history}{logical. If TRUE return a web_history object for use in later calls to the NCBI} \item{\dots}{character, additional terms to add to the request, see NCBI documentation linked to in references for a complete list} } \value{ ids integer Unique IDs returned by the search
count integer Total number of hits for the search
retmax integer Maximum number of hits returned by the search
web_history A web_history object for use in subsequent calls to NCBI
QueryTranslation character, search term as the NCBI interpreted it
file either an XMLInternalDocument xml file resulting from the search, parsed with \code{\link[XML]{xmlTreeParse}} or, if \code{retmode} was set to json, a list resulting from the returned JSON file being parsed with \code{\link[jsonlite]{fromJSON}}. } \description{ Search a given NCBI database with a particular query. } \details{ The NCBI uses a search term syntax in which search terms can be associated with a specific search field using square brackets. So, for instance ``Homo[ORGN]'' denotes a search for Homo in the ``Organism'' field. The names and definitions of these fields can be identified using \code{\link{entrez_db_searchable}}. Searches can make use of several fields by combining them via the boolean operators AND, OR and NOT. So, using the search term ``((Homo[ORGN] AND APP[GENE]) NOT Review[PTYP])'' in PubMed would identify articles matching the gene APP in humans, and exclude review articles. More examples of the use of these search terms, and of the more specific MeSH terms for precise searching, are given in the package vignette. \code{rentrez} handles special characters and URL encoding (e.g. replacing spaces with plus signs) on the client side, so there is no need to include these in search terms. The \code{rentrez} tutorial provides some tips on how to make the most of searches to the NCBI. In particular, the sections on uses of the "Filter" field and MeSH terms may help in formulating precise searches.
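As a concrete illustration, the fielded term described above can be passed straight to \code{entrez_search}. The sketch below is illustrative only (hit counts reflect the live state of the NCBI databases); \code{retmax=0} is used so that only the count is retrieved.
\preformatted{
## Illustrative sketch: a fielded boolean search, using retmax=0
## to return only the number of matching records. Counts depend
## on the live NCBI databases.
entrez_search(db="pubmed",
              term="(Homo[ORGN] AND APP[GENE]) NOT Review[PTYP]",
              retmax=0)
}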
} \examples{ \dontrun{ query <- "Gastropoda[Organism] AND COI[Gene]" web_env_search <- entrez_search(db="nuccore", term=query, use_history=TRUE) snail_coi <- entrez_fetch(db="nuccore", web_history=web_env_search$web_history, rettype="fasta", retmax=10) } \donttest{ fly_id <- entrez_search(db="taxonomy", term="Drosophila") #Oh, right. There is a genus and a subgenus name Drosophila... #how can we limit this search? (tax_fields <- entrez_db_searchable("taxonomy")) #"RANK" looks promising tax_fields$RANK entrez_search(db="taxonomy", term="Drosophila AND Genus[RANK]") } } \references{ \url{http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESearch_} } \seealso{ \code{\link[httr]{config}} for available httr configurations \code{\link{entrez_db_searchable}} to get a set of search fields that can be used in \code{term} for any database }
rentrez/man/extract_from_esummary.Rd0000644000176200001440000000107313111437373017430 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_summary.r \name{extract_from_esummary} \alias{extract_from_esummary} \title{Extract elements from a list of esummary records} \usage{ extract_from_esummary(esummaries, elements, simplify = TRUE) } \arguments{ \item{esummaries}{A list of esummary objects} \item{elements}{the names of the elements to extract} \item{simplify}{logical, if possible return a vector} } \value{ List or vector containing requested elements } \description{ Extract elements from a list of esummary records }
rentrez/man/entrez_citmatch.Rd0000644000176200001440000000241013111437373016170 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/entrez_citmatch.r \name{entrez_citmatch} \alias{entrez_citmatch} \title{Fetch pubmed ids matching specially formatted citation strings} \usage{ entrez_citmatch(bdata, db = "pubmed", retmode = "xml", config = NULL) } \arguments{ \item{bdata}{character, containing citation data. Each citation must be represented in a pipe-delimited format journal_title|year|volume|first_page|author_name|your_key| The final field "your_key" is arbitrary, and can be used as you see fit. Fields can be left empty, but be sure to keep 6 pipes.} \item{db}{character, the database to search. Defaults to pubmed, the only database currently available} \item{retmode}{character, file format to retrieve. Defaults to xml, as per the API documentation, though note the API only returns plain text} \item{config}{vector configuration options passed to httr::GET} } \value{ A character vector containing PMIDs } \description{ Fetch pubmed ids matching specially formatted citation strings } \examples{ \donttest{ ex_cites <- c("proc natl acad sci u s a|1991|88|3248|mann bj|test1|", "science|1987|235|182|palmenberg ac|test2|") entrez_citmatch(ex_cites) } } \seealso{ \code{\link[httr]{config}} for available configs }
rentrez/LICENSE0000644000176200001440000000005713237161636012762 0ustar liggesusersYEAR: 2012-2018 COPYRIGHT HOLDER: David Winter