Data exploration and cleaning

This notebook looks for common sources of error and flags those records for manual revision by the assessors who captured the data for each country.

The output is a file showing the records that need manual review or corrections, if any

Get data

Load required libraries:

library(tidyr)
library(dplyr)
library(utile.tools)
library(stringr)
library(ggplot2)

Get Kobo raw output data:

# Read the raw Kobo export (semicolon-separated, first row holds column names)
# and derive a `taxon` column from the genus, species and
# subspecies_variety fields.
kobo_output <- read.csv(
  file = "International_Genetic_Indicator_testing_V_4.0_-_latest_version_-_False_-_2023-11-02-08-23-26.csv",
  sep = ";",
  header = TRUE
) %>%
  mutate(
    # paste the name parts together (na.rm = TRUE is passed to utile.tools::paste)
    taxon = utile.tools::paste(genus, species, subspecies_variety, na.rm = TRUE),
    # strip white space left at the end of the name
    taxon = str_trim(taxon, side = "right")
  )

Filter out records which were marked as “not_approved” in the manual Kobo validation interface (this means country assessors determined there is something wrong with that particular record).

# Display the records (if any) whose validation status is
# "validation_status_not_approved", showing who assessed which taxon.
select(
  filter(kobo_output, X_validation_status == "validation_status_not_approved"),
  country_assessment, name_assessor, taxon
)

# Keep only the records that are not marked as not approved.
kobo_output <- filter(
  kobo_output,
  X_validation_status != "validation_status_not_approved"
)

Filter out any sort of tests

# Free-text columns in which an assessor may have typed "test" or "Template"
# when trying out the form.
cols <- c(
  "name_assessor", "email_assessor", "genus", "species", "subspecies_variety",
  "scientific_authority", "common_name", "GBIF_taxonID", "NCBI_taxonID",
  "time_populations"
)

# Show the records where any of those columns contains "test" or "Template".
# Matching is a case-sensitive substring search, so review this listing:
# a legitimate value that merely contains "test" would also show up here.
kobo_output %>%
  filter(if_any(all_of(cols), ~ grepl("test|Template", .))) %>%
  select(country_assessment, name_assessor, genus, species)

# Remove the test records from the dataset.
# The negation must wrap if_any(): a row is dropped when ANY of the columns
# contains "test". (Negating inside the lambda would instead keep every row
# that has at least one column without "test", i.e. nearly all of them.)
# Template records are dropped based on the genus column only.
kobo_output <- kobo_output %>%
  filter(!if_any(all_of(cols), ~ grepl("test", .))) %>%
  filter(genus != "Template")

Check for common data capture errors

Number of populations

In the form, -999 was used to mark taxa with unknown number of extant populations. This was used because answering the question was mandatory, so leaving it blank wasn’t possible. We have to change -999 to NA.

# Recode the form's -999 placeholder in n_extant_populations as NA.
kobo_output <- mutate(
  kobo_output,
  n_extant_populations = na_if(n_extant_populations, -999)
)

We can now explore how many populations per species are still extant (still existing, NOT extinct):

# Five-number summary (plus mean and NA count) of extant populations per record.
summary(kobo_output[["n_extant_populations"]])

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##     0.00     1.00     3.00    63.28     8.00 47364.00       19

# Frequency of each distinct number of extant populations.
table(kobo_output[["n_extant_populations"]])

## 
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
##     4   303   134    86    76    41    36    15    23    14    21    13     9 
##    13    14    15    16    17    18    19    20    21    22    23    24    25 
##    13     6    15     7     2    11    11     4     3     3     2     5     2 
##    26    27    28    30    33    34    35    36    37    38    40    42    43 
##     2     1     2     9     2     1     3     3     3     2     1     2     1 
##    45    47    48    50    53    54    55    57    60    65    68    69    72 
##     3     1     1     1     1     1     2     1     1     3     1     1     1 
##    74    75    77    79    80    85    87    89    91    95   100   104   105 
##     1     1     2     1     1     1     1     1     1     1     1     1     2 
##   113   116   122   124   127   137   146   148   150   173   176   180   186 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##   200   241   242   284   286   311   330   345   347   840 47364 
##     1     1     1     1     1     1     1     1     1     1     1

Plot histogram

# Histogram of the number of extant populations across all records.
kobo_output %>%
  ggplot(aes(x = n_extant_populations)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 19 rows containing non-finite values (`stat_bin()`).

Zoom Plot histogram

# Same histogram, restricted to records with 0 to 24 extant populations
# so the bulk of the distribution is visible.
ggplot(
  filter(kobo_output, n_extant_populations >= 0, n_extant_populations < 25),
  aes(x = n_extant_populations)
) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Once -999 has been replaced by NA there should be no negative numbers of populations (if there are any, they are typos that need to be corrected).

# List records with a negative number of extant populations.
kobo_output %>%
  select(
    country_assessment, taxon, name_assessor,
    n_extant_populations, n_extint_populations
  ) %>%
  filter(n_extant_populations < 0)

Show which taxa (if any) have 0 (zero) extant populations. Is this correct? needs to be manually checked

# List records reporting exactly zero extant populations.
kobo_output %>%
  select(
    country_assessment, taxon, name_assessor,
    n_extant_populations, n_extint_populations
  ) %>%
  filter(n_extant_populations == 0)

Show which species (if any) have 999 extant populations. Should this be -999? OR n_extinct pops??

# List records reporting exactly 999 extant populations
# (possibly a mistyped -999 placeholder).
kobo_output %>%
  select(
    country_assessment, taxon, name_assessor,
    n_extant_populations, n_extint_populations
  ) %>%
  filter(n_extant_populations == 999)

Show which species (if any) have 999 EXTINCT populations. Should this be -999?

# List records reporting exactly 999 extinct populations
# (possibly a mistyped -999 placeholder).
kobo_output %>%
  select(
    country_assessment, taxon, name_assessor,
    n_extant_populations, n_extint_populations
  ) %>%
  filter(n_extint_populations == 999)

Put all taxa with weird number of populations that need to be checked together:

# Collect every record with a suspicious number of populations into one table:
# negative or zero extant populations, or 999 in either population count.
check_n_pops <- kobo_output %>%
  filter(
    n_extant_populations <= 0 |
      n_extant_populations == 999 |
      n_extint_populations == 999
  ) %>%
  # keep only the variables of interest
  select(
    country_assessment, name_assessor, taxon,
    n_extant_populations, n_extint_populations
  ) %>%
  # tell the assessor what needs to be checked
  mutate(
    need_to_check = "check number of extant or extint populations. Are 0 correct? should 999 be -999? are extant/extint confused?"
  )

GBIF ID codes

Check GBIF

# Peek at the first few GBIF taxon IDs.
head(kobo_output[["GBIF_taxonID"]])

## [1] "2440940" "2337607" "2477927" ""        "2422522" "5219516"

GBIF IDs tend to be 7 characters long. Some can be longer or shorter, but these seem to be exceptions. Therefore let’s flag any record where the GBIF ID is not exactly 7 characters long, to manually check whether it is correct.

# List records whose GBIF taxon ID is filled in but is not 7 characters long.
kobo_output %>%
  # show only relevant columns
  select(country_assessment, name_assessor, taxon, GBIF_taxonID) %>%
  filter(nchar(GBIF_taxonID) > 0 & nchar(GBIF_taxonID) != 7)

Put them in their own happy df with a column stating what is the likely problem:

# Store the records with a non-empty GBIF taxon ID that is not 7 characters
# long, together with a note explaining what the assessor should verify.
check_GBIF <- kobo_output %>%
  # keep only relevant columns
  select(country_assessment, name_assessor, taxon, GBIF_taxonID) %>%
  filter(nchar(GBIF_taxonID) > 0 & nchar(GBIF_taxonID) != 7) %>%
  # tell the assessor what needs to be checked
  mutate(
    need_to_check = "check the GBIF taxonID. Either it looks plain different, or has more or less than 7 digits (most ids are 7 digits long, and this isn't, it could be an exception, or a mistake)."
  )

Species names

Genus, species and subspecies should be a single word, check if there are cases where it isn’t. Only exception would be “var.” or “subsp.” in the subspecies_variety field:

# List records where genus, species or subspecies_variety contains a space
# (i.e. is more than one word), except when subspecies_variety contains the
# literal abbreviations "var." or "subsp.".
kobo_output %>%
  filter(
    grepl(" ", genus) |
      grepl(" ", species) |
      grepl(" ", subspecies_variety)
  ) %>%
  # fixed = TRUE matches the text literally; as a regular expression the "."
  # would match any character, so e.g. "variegata" would also be exempted
  filter(!grepl("var.", subspecies_variety, fixed = TRUE)) %>%
  filter(!grepl("subsp.", subspecies_variety, fixed = TRUE)) %>%
  # show only relevant columns
  select(country_assessment, name_assessor, taxon, genus, species, subspecies_variety)

Put them in their own happy df with a column stating what is the likely problem:

# Store the records where genus, species or subspecies_variety contains a
# space (more than one word), except when subspecies_variety contains the
# literal abbreviations "var." or "subsp.", with a note for the assessor.
check_taxon_names <- kobo_output %>%
  filter(
    grepl(" ", genus) |
      grepl(" ", species) |
      grepl(" ", subspecies_variety)
  ) %>%
  # fixed = TRUE matches the text literally; as a regular expression the "."
  # would match any character, so e.g. "variegata" would also be exempted
  filter(!grepl("var.", subspecies_variety, fixed = TRUE)) %>%
  filter(!grepl("subsp.", subspecies_variety, fixed = TRUE)) %>%
  # keep only relevant columns
  select(country_assessment, name_assessor, taxon, genus, species, subspecies_variety) %>%
  # tell the assessor what needs to be checked
  mutate(
    need_to_check = "check genus, species or subspecies_variety, we are targeting to have single words in each field, except in the ifraspecific names, where 'var.' and 'subsp.' (only) would be accepted. Other details or taxonomic notes should be added in the comments."
  )

Create a single file for assessors review:

# Combine the three review tables into one. The joins use the columns the
# tables have in common (dplyr reports which ones in a message).
to_check <- check_n_pops %>%
  full_join(check_GBIF) %>%
  full_join(check_taxon_names) %>%
  # arrange the columns in the desired order
  select(
    country_assessment, name_assessor, taxon, need_to_check,
    n_extant_populations, n_extint_populations, GBIF_taxonID,
    genus, species, subspecies_variety
  )

## Joining, by = c("country_assessment", "name_assessor", "taxon",
## "need_to_check")
## Joining, by = c("country_assessment", "name_assessor", "taxon",
## "need_to_check")

# Write the combined review table to disk as UTF-8 CSV, without row names.
write.csv(
  x = to_check,
  file = "kobo_output_tocheck_30april.csv",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)

Session Info for reproducibility purposes:

# Print the R version, platform and attached/loaded package versions.
utils::sessionInfo()

## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.4.1     stringr_1.4.0     utile.tools_0.2.7 dplyr_1.0.9      
## [5] tidyr_1.2.0      
## 
## loaded via a namespace (and not attached):
##  [1] highr_0.9        pillar_1.7.0     bslib_0.3.1      compiler_4.2.1  
##  [5] jquerylib_0.1.4  tools_4.2.1      digest_0.6.29    gtable_0.3.0    
##  [9] jsonlite_1.8.0   evaluate_0.15    lifecycle_1.0.3  tibble_3.1.7    
## [13] pkgconfig_2.0.3  rlang_1.0.6      cli_3.6.0        DBI_1.1.3       
## [17] rstudioapi_0.13  yaml_2.3.5       xfun_0.31        fastmap_1.1.0   
## [21] withr_2.5.0      knitr_1.39       generics_0.1.3   vctrs_0.5.2     
## [25] sass_0.4.1       grid_4.2.1       tidyselect_1.1.2 glue_1.6.2      
## [29] R6_2.5.1         fansi_1.0.3      rmarkdown_2.14   farver_2.1.1    
## [33] purrr_0.3.4      magrittr_2.0.3   scales_1.2.0     ellipsis_0.3.2  
## [37] htmltools_0.5.5  assertthat_0.2.1 colorspace_2.0-3 labeling_0.4.2  
## [41] utf8_1.2.2       stringi_1.7.6    munsell_0.5.0    crayon_1.5.1