11# ' Download representative sequences for a taxon
2- # '
3- # ' Downloads a sample of sequences meant to evenly capture the diversity of a given taxon.
4- # ' Can be used to get a shallow sampling of vast groups.
5- # ' \strong{CAUTION:} This function can make MANY queries to Genbank depending on arguments given and
6- # ' can take a very long time.
7- # ' Choose your arguments carefully to avoid long waits and needlessly stressing NCBI's servers.
8- # ' Use a downloaded database and a parser from the \code{taxa} package when possible.
9- # '
10- # ' @param name (\code{character} of length 1) The taxon to download a sample of sequences for.
11- # ' @param id (\code{character} of length 1) The taxon id to download a sample of sequences for.
12- # ' @param target_rank (\code{character} of length 1) The finest taxonomic rank at which
13- # ' to sample. The finest rank at which replication occurs. Must be a finer rank than
14- # ' \code{taxon}.
15- # ' @param min_counts (named \code{numeric}) The minimum number of sequences to download for each
16- # ' taxonomic rank. The names correspond to taxonomic ranks.
17- # ' @param max_counts (named \code{numeric}) The maximum number of sequences to download for each
18- # ' taxonomic rank. The names correspond to taxonomic ranks.
19- # ' @param interpolate_min (\code{logical}) If \code{TRUE}, values supplied to \code{min_counts}
20- # ' and \code{min_children} will be used to infer the values of intermediate ranks not
21- # ' specified. Linear interpolation between values of specified ranks will be used to determine
22- # ' values of unspecified ranks.
23- # ' @param interpolate_max (\code{logical}) If \code{TRUE}, values supplied to \code{max_counts}
24- # ' and \code{max_children} will be used to infer the values of intermediate ranks not
25- # ' specified. Linear interpolation between values of specified ranks will be used to determine
26- # ' values of unspecified ranks.
27- # ' @param min_children (named \code{numeric}) The minimum number sub-taxa of taxa for a given
28- # ' rank must have for its sequences to be searched. The names correspond to taxonomic ranks.
29- # ' @param max_children (named \code{numeric}) The maximum number sub-taxa of taxa for a given
30- # ' rank must have for its sequences to be searched. The names correspond to taxonomic ranks.
31- # ' @param verbose (\code{logical}) If \code{TRUE}, progress messages will be printed.
32- # ' @inheritParams traits::ncbi_searcher
33- # '
2+ # '
3+ # ' Downloads a sample of sequences meant to evenly capture the diversity of a
4+ # ' given taxon. Can be used to get a shallow sampling of vast groups.
5+ # ' \strong{CAUTION:} This function can make MANY queries to Genbank depending on
6+ # ' arguments given and can take a very long time. Choose your arguments
7+ # ' carefully to avoid long waits and needlessly stressing NCBI's servers. Use a
8+ # ' downloaded database and a parser from the \code{taxa} package when possible.
9+ # '
10+ # ' @param name (\code{character} of length 1) The taxon to download a sample of
11+ # ' sequences for.
12+ # ' @param id (\code{character} of length 1) The taxon id to download a sample of
13+ # ' sequences for.
14+ # ' @param target_rank (\code{character} of length 1) The finest taxonomic rank
15+ # ' at which to sample. The finest rank at which replication occurs. Must be a
16+ # ' finer rank than that of the taxon given by \code{name} or \code{id}.
17+ # ' @param min_counts (named \code{numeric}) The minimum number of sequences to
18+ # ' download for each taxonomic rank. The names correspond to taxonomic ranks.
19+ # ' @param max_counts (named \code{numeric}) The maximum number of sequences to
20+ # ' download for each taxonomic rank. The names correspond to taxonomic ranks.
21+ # ' @param interpolate_min (\code{logical}) If \code{TRUE}, values supplied to
22+ # ' \code{min_counts} and \code{min_children} will be used to infer the values
23+ # ' of intermediate ranks not specified. Linear interpolation between values of
24+ # ' specified ranks will be used to determine values of unspecified ranks.
25+ # ' @param interpolate_max (\code{logical}) If \code{TRUE}, values supplied to
26+ # ' \code{max_counts} and \code{max_children} will be used to infer the values
27+ # ' of intermediate ranks not specified. Linear interpolation between values of
28+ # ' specified ranks will be used to determine values of unspecified ranks.
29+ # ' @param min_children (named \code{numeric}) The minimum number of sub-taxa
30+ # ' a taxon of a given rank must have for its sequences to be searched. The
31+ # ' names correspond to taxonomic ranks.
32+ # ' @param max_children (named \code{numeric}) The maximum number of sub-taxa
33+ # ' a taxon of a given rank can have for its sequences to be searched. The
34+ # ' names correspond to taxonomic ranks.
35+ # ' @param seqrange (character) Sequence range, as e.g., "1:1000". This is the
36+ # ' range of sequence lengths to search for. So "1:1000" means search for
37+ # ' sequences from 1 to 1000 characters in length.
38+ # ' @param getrelated (logical) If TRUE, gets the longest sequences of a species
39+ # ' in the same genus as the one searched for. If FALSE, returns nothing if no
40+ # ' match found.
41+ # ' @param fuzzy (logical) Whether to do fuzzy taxonomic ID search or exact
42+ # ' search. If \code{TRUE}, we use \code{xXarbitraryXx[porgn:__txid<ID>]}, but
43+ # ' if \code{FALSE}, we use \code{txid<ID>}. Default: \code{FALSE}
44+ # ' @param limit (\code{numeric}) Number of sequences to search for and return.
45+ # ' Max of 10,000. If you search for 6000 records, and only 5000 are found, you
46+ # ' will of course only get 5000 back.
47+ # ' @param entrez_query (\code{character}; length 1) An Entrez-format query to
48+ # ' filter results with. This is useful to search for sequences with specific
49+ # ' characteristics. The format is the same as the one used to search Genbank.
50+ # ' (\url{https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Entrez_Searching_Options})
51+ # '
52+ # '
53+ # ' @param hypothetical (\code{logical}; length 1) If \code{FALSE}, an attempt
54+ # ' will be made to not return hypothetical or predicted sequences judging from
55+ # ' accession number prefixes (XM and XR). This can result in less than the
56+ # ' \code{limit} being returned even if there are more sequences available,
57+ # ' since this filtering is done after searching NCBI.
58+ # ' @param verbose (\code{logical}) If \code{TRUE}, progress messages will be
59+ # ' printed.
60+ # '
3461# ' @examples
3562# '
3663# ' \dontrun{
@@ -54,6 +81,8 @@ ncbi_taxon_sample <- function(name = NULL, id = NULL, target_rank,
5481 seqrange = " 1:3000" , getrelated = FALSE ,
5582 fuzzy = TRUE , limit = 10 , entrez_query = NULL ,
5683 hypothetical = FALSE , verbose = TRUE ) {
84+ # Check that the "traits" package has been installed
85+ check_for_pkg(" traits" )
5786
5887 run_once <- function (name , id ) {
5988 default_target_max <- 20
@@ -115,7 +144,7 @@ ncbi_taxon_sample <- function(name = NULL, id = NULL, target_rank,
115144 along.with = between ))
116145 return (NULL )
117146 }
118- zoo :: rollapply( names(user_limits ), width = 2 , set_default_counts )
147+ lapply(seq_len(length( names(user_limits )) - 1 ), function ( i ) set_default_counts(names( user_limits )[ i : ( i + 1 )]))
119148 }
120149
121150 # Extend boundary values to adjacent undefined values - - - - - - - - - - - - - - - - - - - - - -
@@ -142,7 +171,7 @@ ncbi_taxon_sample <- function(name = NULL, id = NULL, target_rank,
142171 level_min_children <- get_level_limit(min_children , 0 , target_rank , interpolate_min ,
143172 extend_min = TRUE )
144173
145- # Recursivly sample taxon ------------------------------------------------------------------------
174+ # Recursively sample taxon ------------------------------------------------------------------------
146175 recursive_sample <- function (id , rank , name ) {
147176 cat(" Processing '" , name , " ' (uid: " , id , " , rank: " , as.character(rank ), " )" , " \n " ,
148177 sep = " " )
0 commit comments