wcvp_matching() is
the main reconciliation function in wcvpmatch. It takes
parsed names or a minimal genus/species table, matches those names
against a WCVP-like backbone, and returns both the matched taxon and
accepted-name context.
This vignette uses a compact in-memory backbone so every example runs quickly and reproducibly.
# Build the compact, in-memory WCVP-like backbone used by the examples.
#
# Returns a four-row tibble with nine columns. Three rows are accepted
# names; "Veronica vulcanica" (id 10) is a synonym whose accepted id (200)
# is the "Veronica spathulata" row.
make_matching_backbone <- function() {
  genera <- c("Aniba", "Jaltomata", "Veronica", "Veronica")
  epithets <- c("heterotepala", "sagastegui", "vulcanica", "spathulata")
  name_ids <- c(1, 2, 10, 200)
  # The synonym row (id 10) resolves to id 200; every other row is its own
  # accepted name.
  accepted_ids <- c(1, 2, 200, 200)
  statuses <- c("Accepted", "Accepted", "Synonym", "Accepted")
  authors <- c("A.Author", "B.Author", "C.Author", "D.Author")

  tibble(
    genus = genera,
    species = epithets,
    # No infraspecific taxa in this toy backbone; the scalar NA is recycled.
    infraspecific_rank = NA_character_,
    infraspecies = NA_character_,
    plant_name_id = name_ids,
    taxon_name = paste(genera, epithets),
    taxon_authors = authors,
    taxon_status = statuses,
    accepted_plant_name_id = accepted_ids
  )
}
# Build the example backbone once; the matching examples below pass it as
# `target_df`.
matching_backbone <- make_matching_backbone()
# Print the backbone to show its columns.
matching_backbone
#> # A tibble: 4 × 9
#> genus species infraspecific_rank infraspecies plant_name_id taxon_name
#> <chr> <chr> <chr> <chr> <dbl> <chr>
#> 1 Aniba heterotepa… <NA> <NA> 1 Aniba het…
#> 2 Jaltomata sagastegui <NA> <NA> 2 Jaltomata…
#> 3 Veronica vulcanica <NA> <NA> 10 Veronica …
#> 4 Veronica spathulata <NA> <NA> 200 Veronica …
#> # ℹ 3 more variables: taxon_authors <chr>, taxon_status <chr>,
#> # accepted_plant_name_id <dbl>
classify_spnames() standardizes the incoming names
before matching. It is the recommended entry point when you start from
raw taxon strings.
# Parse three raw taxon strings. The second one differs in spelling from
# the backbone entry "Jaltomata sagastegui".
parsed_names <- classify_spnames(c(
  "Aniba heterotepala",
  "Jaltometa sagasteguii",
  "Veronica vulcanica"
))

# Show the submitted string next to its parsed genus, species, and rank.
select(parsed_names, Input.Name, Orig.Genus, Orig.Species, Rank)
#> # A tibble: 3 × 4
#> Input.Name Orig.Genus Orig.Species Rank
#> <chr> <chr> <chr> <dbl>
#> 1 Aniba heterotepala Aniba heterotepala 2
#> 2 Jaltometa sagasteguii Jaltometa sagasteguii 2
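#> 3 Veronica vulcanica Veronica vulcanica 2
The example below shows three common outcomes: an exact match, a fuzzy
match that recovers a misspelled name, and a synonym that resolves to a
different accepted name.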
# Match the parsed names against the in-memory backbone.
matched <- wcvp_matching(
  parsed_names,
  target_df = matching_backbone,
  allow_duplicates = TRUE,
  max_dist = 2,
  method = "osa",
  add_name_distance = TRUE,
  output_name_style = "snake_case"
)

# Compare each submitted name with its matched and accepted names, along
# with the status and distance columns.
select(
  matched,
  input_name,
  matched_taxon_name,
  accepted_taxon_name,
  taxon_status,
  matched_dist
)
#> # A tibble: 3 × 5
#> input_name matched_taxon_name accepted_taxon_name taxon_status matched_dist
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 Aniba hetero… Aniba heterotepala Aniba heterotepala accepted 0
#> 2 Jaltometa sa… Jaltomata sagaste… Jaltomata sagasteg… accepted 2
#> 3 Veronica vul… Veronica vulcanica Veronica spathulata synonym 0
The logical stage flags show how each input was recovered. These are useful when you want to audit matching behavior or separate exact from fuzzy recoveries.
# Keep the submitted name plus the logical flag recorded for each
# matching stage, ending with the overall `matched` flag.
select(
  matched,
  input_name,
  direct_match,
  genus_match,
  fuzzy_match_genus,
  direct_match_species_within_genus,
  suffix_match_species_within_genus,
  fuzzy_match_species_within_genus,
  matched
)
#> # A tibble: 3 × 8
#> input_name direct_match genus_match fuzzy_match_genus direct_match_species…¹
#> <chr> <lgl> <lgl> <lgl> <lgl>
#> 1 Aniba heter… TRUE NA NA NA
#> 2 Jaltometa s… FALSE FALSE TRUE FALSE
#> 3 Veronica vu… TRUE NA NA NA
#> # ℹ abbreviated name: ¹direct_match_species_within_genus
#> # ℹ 3 more variables: suffix_match_species_within_genus <lgl>,
#> # fuzzy_match_species_within_genus <lgl>, matched <lgl>
In this example, Aniba heterotepala is recovered
directly, while Jaltometa sagasteguii is resolved through
fuzzy matching.
wcvp_matching() returns both the matched taxon and the
accepted taxon. That is important when the submitted name is a
synonym.
# Isolate the synonym row and show the matched name beside the accepted
# name, with the author strings for both.
select(
  filter(matched, input_name == "Veronica vulcanica"),
  input_name,
  matched_taxon_name,
  accepted_taxon_name,
  matched_taxon_authors,
  accepted_taxon_authors,
  taxon_status,
  is_accepted_name
)
#> # A tibble: 1 × 7
#> input_name matched_taxon_name accepted_taxon_name matched_taxon_authors
#> <chr> <chr> <chr> <chr>
#> 1 Veronica vulcani… Veronica vulcanica Veronica spathulata C.Author
#> # ℹ 3 more variables: accepted_taxon_authors <chr>, taxon_status <chr>,
#> # is_accepted_name <lgl>
Here the matched taxon is Veronica vulcanica, but the
accepted taxon is Veronica spathulata.
Duplicate input rows are allowed when
allow_duplicates = TRUE. In that case the function
preserves row identity with input_index.
# Two identical rows, supplied as a minimal genus/species table.
# tibble() evaluates its arguments in order, so Input.Name can be built
# from the Genus and Species columns defined just before it.
duplicate_input <- tibble(
  Genus = rep("Aniba", 2),
  Species = rep("heterotepala", 2),
  Input.Name = paste(Genus, Species)
)

# Match with duplicates allowed and show the per-row input_index.
wcvp_matching(
  duplicate_input,
  target_df = matching_backbone,
  allow_duplicates = TRUE,
  output_name_style = "snake_case"
) |>
  select(input_index, input_name, matched_taxon_name, accepted_taxon_name)
#> # A tibble: 2 × 4
#> input_index input_name matched_taxon_name accepted_taxon_name
#> <int> <chr> <chr> <chr>
#> 1 1 Aniba heterotepala Aniba heterotepala Aniba heterotepala
#> 2 2 Aniba heterotepala Aniba heterotepala Aniba heterotepala
For larger jobs, profile = TRUE attaches timing
information for each stage of the pipeline.
# Same call as the main matching example, with profiling switched on.
profiled <- wcvp_matching(
  parsed_names,
  target_df = matching_backbone,
  allow_duplicates = TRUE,
  max_dist = 2,
  method = "osa",
  add_name_distance = TRUE,
  output_name_style = "snake_case",
  profile = TRUE
)

# The per-stage timings are stored in the "timings" attribute of the result.
select(attr(profiled, "timings"), stage, rows, elapsed_seconds)
#> # A tibble: 16 × 3
#> stage rows elapsed_seconds
#> <chr> <int> <dbl>
#> 1 check_df_format 3 0.00500
#> 2 deduplicate_input 3 0.00400
#> 3 check_df_consistency 3 0.00400
#> 4 get_db NA 0.00400
#> 5 prefilter_target_by_genus 3 0.0250
#> 6 wcvp_direct_match 3 0.0570
#> 7 wcvp_genus_match 1 0.0140
#> 8 wcvp_fuzzy_match_genus 1 0.0440
#> 9 wcvp_direct_match_species_within_genus 1 0.0150
#> 10 wcvp_suffix_match_species_within_genus 1 0.0250
#> 11 wcvp_fuzzy_match_species_within_genus 1 0.0350
#> 12 prepare_taxonomic_context_data 3 0.0110
#> 13 add_taxonomic_context 3 0.00900
#> 14 add_name_distance 3 0.00100
#> 15 standardize_output_names 3 0.00100
#> 16 total 3 0.314
- Use classify_spnames() when your input is raw text.
- Use allow_duplicates = TRUE for production data that may contain
  repeated names.
- Use output_name_style = "snake_case" if you want easier downstream use
  with dplyr.
- Use add_name_distance = TRUE when you want a compact numeric summary of
  how far each submitted name is from the matched name.