Summarize eBird Data by Grid Cell

Published

October 2, 2023

Setup

library(auk)

auk 0.6.0 is designed for EBD files downloaded after 2022-10-25. 
EBD data directory:  /home/steffi/Projects/Business/Matt/lb_curlew_distribution/Data/Raw 
eBird taxonomy version:  2022

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(sf)

Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE

library(assertr)

grid_10 <- read_rds("Data/Datasets/grid_10km.rds")
grid_20 <- read_rds("Data/Datasets/grid_20km.rds")
bcr <- read_rds("Data/Datasets/bcr_map.rds")

Best Practices

There are two levels of best practices we’ll be applying here.

First we’ll Filter the data to include only those checklists recommended by the eBird data best practices(https://cornelllabofornithology.github.io/ebird-best-practices/)

Keep only standard protocols of “traveling” and “stationary counts
Keep only < 5 km
Keep only < 5 hrs (300min)

Next we’ll Summarize the data to address the Spatial and Temporal Biases:

Spatial bias: “…most participants in citizen science surveys sample near their homes (Luck et al. 2004), in easily accessible areas such as roadsides (Kadmon, Farber, and Danin 2004), or in areas and habitats of known high biodiversity (Prendergast et al. 1993). A simple method to reduce the spatial bias is to create an equal area grid over the region of interest, and sample a given number of checklists from within each grid cell.”

Temporal bias: “…participants preferentially sample when they are available, such as weekends (Courter et al. 2013), and at times of year when they expect to observe more birds, notably during spring migration (Sullivan et al. 2014). To address the weekend bias, we recommend using a temporal scale of a week or multiple weeks for most analyses.”

So we use 10x10km or 20x20km grids and summarize over years

Data files

NOTE: These are the names of MY files

if you re-download the data you will have a different data version (this one is May-2023)
I restricted the dates of the files from May 2010 to Aug 2022
Make sure to update the files names to match the names of your files

Match all files starting with txt (if you followed the instructions in Scripts/01_setup this should show you your files)

list.files("Data/Raw/", pattern = "txt")

[1] "ebd_lobcur_201005_202208_relMay-2023.txt"
[2] "ebd_sampling_relMay-2023.txt"

Filter data

First we’ll create the filters and then tweak the outputs.

This creates (but doesn’t apply) the filters

filters <- auk_ebd("ebd_lobcur_201005_202208_relMay-2023.txt",
                   file_sampling = "ebd_sampling_relMay-2023.txt") |>
  
  # Keep only standard protocols (Best Practices)
  auk_protocol(protocol = c("Stationary", "Traveling")) |>
  
  # Restrict to suggested limits (Best Practices)
  auk_duration(c(0, 60*5)) |>
  auk_distance(distance = c(0, 5)) |>
  
  # Complete data (for getting a proxy of effort)
  auk_complete() |>
  
  # Date filters
  auk_date(date = c("*-05-01", "*-07-31")) |>
  
  # Year filters (only applicable to sampling checklists) |>
  auk_year(year = 2010:2022) |>
  
  # Spatial filters - Limit to BCR selection
  auk_bbox(st_bbox(st_transform(bcr, crs = 4326))) # bbox must be GPS lat/lon (4326)

Now, we’ll apply the filters and save the output

Takes time so only run if we haven’t before.
Delete the outputs to re-run

if(!file.exists("Data/Intermediate/ebird_lobcur_obs.txt")) {
  auk_filter(
    filters,
    file = "Data/Intermediate/ebird_lobcur_obs.txt",             # Output
    file_sampling = "Data/Intermediate/ebird_checklists.txt",    # Output
    overwrite = TRUE)
}

Now we’ll load in the filtered data and do some more filting to clean it up a bit more.

Convert all NA distances to 0 and filter again (distance filters didn’t apply to sampling - bug?). This can also take a little time as the sampling (checklist) data is pretty large

sampling <- read_sampling("Data/Intermediate/ebird_checklists.txt") |>
  mutate(effort_distance_km = replace_na(effort_distance_km, 0)) |>
  filter(effort_distance_km <= 5)

Convert NA distances to 0

obs <- read_ebd("Data/Intermediate/ebird_lobcur_obs.txt") |>
  mutate(effort_distance_km = replace_na(effort_distance_km, 0))

Finally…

Zero fill the data - Ensures that ‘complete’ checklists included as zero counts
Final checks to make sure filters worked as expected (assert() functions)
Keep only the columns which are useful to us
- Note There is a BCR column which we are not using as we will assign BCR membership to the grid cells based on overlap with the BCR shapefiles. This means it’s possible that the odd edge checklist may or may not be assigned to a grid cell with the same BCR as the observation itself, but we won’t worry about that.

full <- auk_zerofill(obs, sampling_events = sampling, collapse = TRUE) |>
  mutate(year = year(observation_date), month = month(observation_date)) |>
  assert(within_bounds(0, 5), effort_distance_km) |>
  assert(within_bounds(2010, 2022), year) |>
  assert(within_bounds(5, 7), month) |>
  select("year","observation_date", "checklist_id", 
         "observation_count", "species_observed", 
         "latitude", "longitude",
         "scientific_name")

write_rds(full, "Data/Intermediate/lobcur_complete.rds")

Take a quick peak at what this data looks like

full |> 
  slice(1:10)

# A tibble: 10 × 8
    year observation_date checklist_id observation_count species_observed
   <dbl> <date>           <chr>        <chr>             <lgl>           
 1  2014 2014-05-31       S18657134    0                 FALSE           
 2  2017 2017-06-26       S37821275    0                 FALSE           
 3  2017 2017-06-30       S37890425    0                 FALSE           
 4  2016 2016-06-23       S30365159    0                 FALSE           
 5  2015 2015-05-22       S23574660    0                 FALSE           
 6  2018 2018-06-04       S46317069    0                 FALSE           
 7  2018 2018-07-31       S47562117    0                 FALSE           
 8  2018 2018-07-08       S47070039    0                 FALSE           
 9  2018 2018-07-22       S47352459    0                 FALSE           
10  2011 2011-06-14       S47630389    0                 FALSE           
# ℹ 3 more variables: latitude <dbl>, longitude <dbl>, scientific_name <chr>

Summarize data by grid

First convert this filtered eBird data to spatial

ebird_sf <- read_rds("Data/Intermediate/lobcur_complete.rds") |>
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) |> # Lat/lon are GPS (4326)
  st_transform(st_crs(grid_10)) # Now transform to match the grid data

Then join with grid data and summarize by grid

Get counts of checklists for each year for each grid (only concerned with yearly counts)
number of checklists total (total_checklists, based on the sum of checklists associated with each grid cell. If there were none, this grid cell would have an NA in year)
number of checklists with a bird detected (total_obs, based on species_observed which is a logical TRUE/FALSE column)
This takes time to run!

ebird_grid_10_sf <- st_join(grid_10, ebird_sf) |>
  summarize(total_checklists = sum(!is.na(year)),
            total_obs = sum(species_observed, na.rm = TRUE),
            presence = any(species_observed, na.rm = TRUE),
            date_min = min(observation_date),
            date_max = max(observation_date),
            .by = c("year", "grid_id"))
write_rds(ebird_grid_10_sf, "Data/Intermediate/lobcur_grid_10.rds")

ebird_grid_20_sf <- st_join(grid_20, ebird_sf) |>
  summarize(total_checklists = sum(!is.na(observation_date)),
            total_obs = sum(species_observed, na.rm = TRUE),
            presence = any(species_observed, na.rm = TRUE),
            date_min = min(observation_date),
            date_max = max(observation_date),
            .by = c("year", "grid_id"))
write_rds(ebird_grid_20_sf, "Data/Intermediate/lobcur_grid_20.rds")

Take a peak at what this kind of data looks like now (without the spatial characteristics)

ebird_grid_10_sf |>
  arrange(grid_id, year) |>
  st_drop_geometry() |> # Not necessary for a peak
  slice(1:10)

   year grid_id total_checklists total_obs presence date_min date_max
1    NA       1                0         0    FALSE     <NA>     <NA>
2    NA       2                0         0    FALSE     <NA>     <NA>
3    NA       3                0         0    FALSE     <NA>     <NA>
4    NA       4                0         0    FALSE     <NA>     <NA>
5    NA       5                0         0    FALSE     <NA>     <NA>
6    NA       6                0         0    FALSE     <NA>     <NA>
7    NA       7                0         0    FALSE     <NA>     <NA>
8    NA       8                0         0    FALSE     <NA>     <NA>
9    NA       9                0         0    FALSE     <NA>     <NA>
10   NA      10                0         0    FALSE     <NA>     <NA>

Centroids

Calculate grid centroids as lat/lon in a non-spatial data file. Most calculations don’t require a spatial data frame, and they’re slow to work with.

First calculate centroids for the two grid dimensions (recorded in lat/lon coords)

grid_10_centroid <- read_rds("Data/Datasets/grid_10km.rds") |>
  st_centroid() |>
  st_transform(4326) |>
  mutate(coords = as.data.frame(st_coordinates(geometry))) |>
  st_drop_geometry() |>
  unnest(coords) |>
  rename(lon = X, lat = Y)

Warning: st_centroid assumes attributes are constant over geometries

grid_20_centroid <- read_rds("Data/Datasets/grid_20km.rds") |>
  st_centroid() |>
  st_transform(4326) |>
  mutate(coords = as.data.frame(st_coordinates(geometry))) |>
  st_drop_geometry() |>
  unnest(coords) |>
  rename(lon = X, lat = Y)

Warning: st_centroid assumes attributes are constant over geometries

Now we’ll grab the checklist summaries, drop the spatial data and join to the grids centroids, and save the data

read_rds("Data/Intermediate/lobcur_grid_10.rds") |>
  st_drop_geometry() |>
  left_join(grid_10_centroid, by = "grid_id") |>
  write_rds("Data/Datasets/lobcur_grid_10_coords.rds")

read_rds("Data/Intermediate/lobcur_grid_20.rds") |>
  st_drop_geometry() |>
  left_join(grid_20_centroid, by = "grid_id") |>
  write_rds("Data/Datasets/lobcur_grid_20_coords.rds")