wcvp_distribution()
retrieves geographic distribution from a pair of WCVP-like tables: one
names table and one distribution table. In regular use those tables can
come from wcvpdata::wcvp_checklist_names and
wcvpdata::wcvp_checklist_distribution, but the function
also works with user-supplied tables that follow the same schema.
This vignette uses a compact reproducible example so every code block runs without depending on external data packages.
# Build the toy WCVP-like names table used by the examples in this vignette.
#
# Returns a tibble with six species-rank records: four accepted names and
# two synonyms (ids 2 and 5) whose accepted_plant_name_id points at ids 3
# and 1 respectively.
make_distribution_names <- function() {
  n_taxa <- 6

  genus_names <- c(
    "Opuntia", "Nopalea", "Opuntia", "Quercus", "Opuntia", "Mammillaria"
  )
  epithets <- c(
    "ficus-indica", "cochenillifera", "cochenillifera",
    "robur", "tuna", "elongata"
  )

  tibble(
    # Stored as double (not integer) so the column prints as <dbl>.
    plant_name_id = as.numeric(seq_len(n_taxa)),
    # NA for accepted names; synonyms carry the id of their accepted name.
    accepted_plant_name_id = c(NA, 3, NA, NA, 1, NA),
    taxon_rank = rep("Species", n_taxa),
    taxon_status = c(
      "Accepted", "Synonym", "Accepted", "Accepted", "Synonym", "Accepted"
    ),
    family = c(
      "Cactaceae", "Cactaceae", "Cactaceae",
      "Fagaceae", "Cactaceae", "Cactaceae"
    ),
    genus = genus_names,
    species = epithets,
    taxon_name = c(
      "Opuntia ficus-indica",
      "Nopalea cochenillifera",
      "Opuntia cochenillifera",
      "Quercus robur",
      "Opuntia tuna",
      "Mammillaria elongata"
    )
  )
}
# Build the toy WCVP-like distribution table used by the examples in this
# vignette.
#
# Returns a tibble with seven locality records keyed by plant_name_id.
# The codes and names are illustrative fixture values; each name column is
# derived from its code column through a small lookup so the two cannot
# drift apart.
make_distribution_records <- function() {
  continent_lookup <- c(
    "8" = "SOUTHERN AMERICA",
    "4" = "NORTHERN AMERICA",
    "1" = "EUROPE"
  )
  region_lookup <- c(
    "83" = "Western South America",
    "41" = "Mexico",
    "10" = "Europe",
    "85" = "Southern South America"
  )
  area_lookup <- c(
    MEX = "Mexico",
    PER = "Peru",
    COL = "Colombia",
    ESP = "Spain",
    GAL = "Galapagos"
  )

  continent_codes <- c("8", "8", "8", "4", "1", "8", "8")
  region_codes <- c("83", "83", "83", "41", "10", "85", "83")
  area_codes <- c("MEX", "PER", "COL", "MEX", "ESP", "GAL", "MEX")

  n_records <- length(area_codes)
  no_flag <- rep(0, n_records)

  tibble(
    plant_locality_id = seq_len(n_records),
    plant_name_id = c(1, 2, 3, 3, 4, 5, 6),
    continent_code_l1 = continent_codes,
    # unname() drops the lookup keys so the columns are plain character.
    continent = unname(continent_lookup[continent_codes]),
    region_code_l2 = region_codes,
    region = unname(region_lookup[region_codes]),
    area_code_l3 = area_codes,
    area = unname(area_lookup[area_codes]),
    # Only the fourth record (plant_name_id 3 in MEX) is flagged introduced.
    introduced = replace(no_flag, 4, 1),
    extinct = no_flag,
    location_doubtful = no_flag
  )
}
# Build both toy tables once; every example below passes them to
# wcvp_distribution() through wcvp_names and wcvp_distributions.
distribution_names <- make_distribution_names()
distribution_records <- make_distribution_records()
# Print the names table.
distribution_names
#> # A tibble: 6 × 8
#> plant_name_id accepted_plant_name_id taxon_rank taxon_status family genus
#> <dbl> <dbl> <chr> <chr> <chr> <chr>
#> 1 1 NA Species Accepted Cactaceae Opuntia
#> 2 2 3 Species Synonym Cactaceae Nopalea
#> 3 3 NA Species Accepted Cactaceae Opuntia
#> 4 4 NA Species Accepted Fagaceae Quercus
#> 5 5 1 Species Synonym Cactaceae Opuntia
#> 6 6 NA Species Accepted Cactaceae Mammil…
#> # ℹ 2 more variables: species <chr>, taxon_name <chr>
# Print the distribution table.
distribution_records
#> # A tibble: 7 × 11
#> plant_locality_id plant_name_id continent_code_l1 continent region_code_l2
#> <int> <dbl> <chr> <chr> <chr>
#> 1 1 1 8 SOUTHERN AME… 83
#> 2 2 2 8 SOUTHERN AME… 83
#> 3 3 3 8 SOUTHERN AME… 83
#> 4 4 3 4 NORTHERN AME… 41
#> 5 5 4 1 EUROPE 10
#> 6 6 5 8 SOUTHERN AME… 85
#> 7 7 6 8 SOUTHERN AME… 83
#> # ℹ 6 more variables: region <chr>, area_code_l3 <chr>, area <chr>,
#> # introduced <dbl>, extinct <dbl>, location_doubtful <dbl>

At species level, wcvp_distribution() parses input
names, matches them through the same backend used by
wcvp_matching(), resolves accepted names when possible, and
then joins the result to the distribution table.
# Species-level lookup with two queries: a misspelling of
# "Nopalea cochenillifera" (extra trailing "a") and a name that does not
# occur in the names table.
species_out <- wcvp_distribution(
c("Nopalea cochenilliferaa", "Taxon inexistente"),
taxon_rank = "species",
wcvp_names = distribution_names,
wcvp_distributions = distribution_records
)
# Show only the columns discussed in the text.
# "submited_name" is spelled this way in the function's output.
species_out |>
select(
submited_name,
matched_taxon,
accepted_taxon_name,
area_code_l3,
area,
distribution_status
)
#> # A tibble: 3 × 6
#> submited_name matched_taxon accepted_taxon_name area_code_l3 area
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Nopalea cochenilliferaa Opuntia cochen… Opuntia cochenilli… MEX Mexi…
#> 2 Nopalea cochenilliferaa Opuntia cochen… Opuntia cochenilli… COL Colo…
#> 3 Taxon inexistente <NA> <NA> <NA> <NA>
#> # ℹ 1 more variable: distribution_status <chr>

The first query is fuzzy-matched to a synonym and then resolved to
the accepted taxon Opuntia cochenillifera. The second query
is preserved in the output with
distribution_status = "no_match".
When summarise_by_input = TRUE, the function collapses
the area-related columns to one row per input taxon.
# Same two queries as before, but with summarise_by_input = TRUE so the
# area-related columns are collapsed to one row per input taxon.
species_summary <- wcvp_distribution(
c("Nopalea cochenilliferaa", "Taxon inexistente"),
taxon_rank = "species",
summarise_by_input = TRUE,
wcvp_names = distribution_names,
wcvp_distributions = distribution_records
)
# Show the collapsed columns next to the status of each query.
species_summary |>
select(
submited_name,
accepted_taxon_name,
distribution_status,
area_codes,
distribution,
n_areas
)
#> # A tibble: 2 × 6
#> submited_name accepted_taxon_name distribution_status area_codes distribution
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Nopalea coche… Opuntia cochenilli… distribution_found COL - MEX Colombia - …
#> 2 Taxon inexist… <NA> no_match <NA> <NA>
#> # ℹ 1 more variable: n_areas <int>

This format is useful for reporting and export because
distribution, areas, regions, and
continents are returned as collapsed text values separated
by " - ".
At genus level the function aggregates all accepted taxa in the genus and then summarises occurrences by area.
# Genus-level lookup for "Opuntia". introduced = FALSE drops records
# flagged as introduced, so only native occurrences remain.
genus_out <- wcvp_distribution(
"Opuntia",
taxon_rank = "genus",
introduced = FALSE,
wcvp_names = distribution_names,
wcvp_distributions = distribution_records
)
# Show the per-area summary together with the occurrence type.
genus_out |>
select(matched_taxon, area_code_l3, area, occurrence_type, distribution_status)
#> # A tibble: 2 × 5
#> matched_taxon area_code_l3 area occurrence_type distribution_status
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Opuntia COL Colombia native distribution_found
#> 2 Opuntia MEX Mexico native distribution_found

The introduced Mexico record is excluded here because
introduced = FALSE.
Family-level lookup can also use fuzzy matching through
fozziejoin.
# Family-level lookup with a misspelled family name ("Cactacee" instead of
# "Cactaceae"). max_dist = 1 presumably caps the allowed string distance
# for the fuzzy match -- confirm against the function documentation.
family_out <- wcvp_distribution(
"Cactacee",
taxon_rank = "family",
max_dist = 1,
wcvp_names = distribution_names,
wcvp_distributions = distribution_records
)
# Keep the match columns and areas, then drop duplicated rows.
family_out |>
select(matched_taxon, match_distance, area_code_l3, area) |>
distinct()
#> # A tibble: 2 × 4
#> matched_taxon match_distance area_code_l3 area
#> <chr> <dbl> <chr> <chr>
#> 1 Cactaceae 1 MEX Mexico
#> 2 Cactaceae 1 COL Colombia

If a species query cannot recover species-level distribution, the function can fall back to the matched genus without interrupting execution.
# Species-level query whose epithet does not exist in the names table,
# while its genus ("Opuntia") does; this triggers the genus fallback.
fallback_out <- wcvp_distribution(
"Opuntia especieinventada",
taxon_rank = "species",
wcvp_names = distribution_names,
wcvp_distributions = distribution_records
)
# Show the matched genus, its areas, and the fallback status.
fallback_out |>
select(submited_name, matched_taxon, area_code_l3, area, distribution_status)
#> # A tibble: 3 × 5
#> submited_name matched_taxon area_code_l3 area distribution_status
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Opuntia especieinventada Opuntia MEX Mexico genus_distribution…
#> 2 Opuntia especieinventada Opuntia COL Colom… genus_distribution…
#> 3 Opuntia especieinventada Opuntia MEX Mexico genus_distribution…

This fallback is marked with
distribution_status = "genus_distribution_fallback".
- Use summarise_by_input = TRUE when you need one row per
  submitted name.
- native, introduced, extinct, and location_doubtful can be used as
  filters before the final summary is returned.
- If wcvpdata is installed, you can omit
  wcvp_names and wcvp_distributions to use the
  default backbone.