Setup

Author

Steffi LaZerte

Published

July 4, 2024

This page explains the setup code defined in XX_setup.R which is sourced at the start of each workflow step.

Packages

# Motus data and data bases
library(motus)     # Fetch motus data
library(DBI)       # Data bases
library(arrow)     # feather files

# Tidy data manipulation
library(purrr)     # Loops
library(furrr)     # Parallel loops
library(dplyr)     # Manipulate data
library(tidyr)     # Transform data
library(tibble)    # Data frames (ish)
library(stringr)   # Text manipulation/search
library(readr)     # Read data
library(lubridate) # Deal with date/times
library(assertr)   # Check data quality

# Data display
library(ggplot2)   # Figures
library(ggrepel)   # Push labels away from points in figures
library(patchwork) # Combine figures
library(gt)        # HTML tables

# Spatial data
library(sf)        # Work with spatial data
library(rnaturalearth) # Get maps
library(ebirdst)   # Fetch species ranges
library(units)     # Deal with units of distance and speed
library(ggspatial) # Plot spatial data
library(lutz)      # Timezones by location

Motus credentials

Here we set the credentials for this session (so you don’t have to manually type it in every time)

This code expects that you store your Motus login and password as URBAN_USER and URBAN_PASSWORD in an .Renviron file

learn more about this file: https://rstats.wtf/r-startup.html#renviron
easily edit this file with usethis::edit_r_environ()

motus:::sessionVariable(name = "userLogin", val = Sys.getenv("URBAN_USER"))
motus:::sessionVariable(name = "userPassword", val = Sys.getenv("URBAN_PASSWORD"))

Define projects

Here we’re using a small set of pilot projects

Urban Ecology of Birds in the Lower mainland of BC #484
Montreal Urban Forests #551 (191 tags but 62 on resident cardinals)
Montreal Connectivity #373 (105 tags)
Montreal moult migrant research #168 (113 tags)

Additionally we’re using open motus projects selected via Selecting Projects.

projects <- c(484, 551, 373, 168, # From Barbara and Elizabeth
              352, 364, 393, 417, 464, 515, 607) # Open Motus projects
projects <- setNames(projects, projects)

Define constants

Here we define any cutoffs or constants we will be using

max_flight_speed is the maximum speed of travel we think is possible (with a buffer)
max_tower_dist is the maximum distance between towers/receivers within which a bird could be detected at both towers (and thus flight speed has no bearing)
bout_cutoff is the maximum amount of time between two runs at a specific station before those runs will be considered separate detection bouts.

max_flight_speed <- set_units(72, "m/s")
max_tower_dist <- set_units(50, "km")
bout_cutoff <- set_units(30, "min")

Data

Get species lists from NatureCounts taxonomy and join this with the species metadata from the Motus data bases.

We only need to do this to one database, as the metadata() data step in Download/Update data adds full species lists to all databases (so metadata is the same across all databases).

Species

Databases

Load existing databases

dbs <- map(projects, \(x) tagme(x, dir = "Data/01_Raw", update = FALSE))

Establish included species

We match existing species in the databases to the species list from NatureCounts
We rename scientific names to omit subspecies (e.g., myrtle yellow-rumped warbler) [required for later eBird range steps]

# naturecounts::nc_metadata() # Update naturecounts taxonomy lists as needed
species_list <- naturecounts::meta_species_taxonomy() |>
  filter(order_taxon %in% c("Passeriformes", "Piciformes"))

sp <- tbl(dbs[[1]], "tagDeps") |>
  # Only get species which are in our specific projects
  filter(projectID %in% projects) |>
  semi_join(x = tbl(dbs[[1]], "species"), y = _,
            by = c("id" = "speciesID")) |>
  select(-"group", -"sort") |>
  distinct() |>
  collect() |>
  # Keep only chosen species
  filter(id %in% species_list$species_id) |>
  arrange(id) |>
  # Fix subspecies names for joining with eBird (e.g., myrtle yellow-rumped warbler)
  mutate(
    scientific_motus = scientific,
    scientific = str_replace(scientific, "Setophaga coronata coronata", "Setophaga coronata"))

Functions

These are the custom functions used in this workflow. Some functions are displayed in the various workflow pages, but they are all sourced from XX_functions.R (as is the following).

remove_by_ID <- function(src, t, id_type = "batchID", ids) {
  if(length(ids) > 0) {
    if(t %in% DBI::dbListTables(src)) {
      n <- DBI::dbExecute(
        src, 
        glue::glue("DELETE FROM {`t`} WHERE {`id_type`} IN (",
                   glue::glue_collapse(ids, sep = ', '), 
                   ")"))
      if(n > 0) message(msg_fmt("  {n} rows deleted from {t}"))
    }
  }
}


filter_count <- function(dbs, filter) {
  before <- map(dbs, \(x) tbl(x, "runs") |> count(name = "n_before") |> collect()) |>
    list_rbind(names_to = "proj_id")
  after <- imap(dbs, \(x, y) {
    remove <- getRunsFilters(x, filter, y)
    tbl(x, "runs") |>
      anti_join(remove, by = c("runID", "motusTagID")) |>
      count(name = "n_after") |>
      collect()
  }) |>
    list_rbind()

  bind_cols(before, after) |>
    mutate(p_reduced = (n_before - n_after) / n_before * 100)
}

gt_theme <- function(data, ...) {
  data |>
    tab_options(
      table.border.top.width = px(3), 
      column_labels.font.weight = "bold",
      column_labels.border.bottom.width = px(3),
      data_row.padding = px(7),
      ...
    )
}
 
# Collect and transform ts values to date/times 
collect_ts <- function(db) {
  db |>
    collect() |>
    mutate(across(contains("ts", ignore.case = FALSE), as_datetime))
}

# Collect run/tag information for filters
collect_filter <- function(db) {
  db |>
    rename_with(~"motusTagID", .cols = any_of("tagID")) |>
    select(runID, motusTagID) |>
    mutate(probability = 0) |>
    collect()
}

anti_join_quick <- function(x, y, by = NULL, copy = FALSE, motus_filter = TRUE) {
  if(motus_filter) {
    y <- select(y, "runID", "tagID")
    if(is.null(by)) by <- c("runID", "tagID")
  }
  y <- mutate(y, ANTI = 1) 
  left_join(x, y, by = by, copy = copy) |>
    filter(is.na(ANTI)) |>
    select(-"ANTI")
}

# ---- custom_runs ----
custom_runs <- function(db) {
  # Replacement for NULL tsEnd values (i.e. today plus change)
  max_ts <- round(as.numeric(Sys.time()) + 1000)
  
  # Get Receivers
  r <- tbl(db, "recvDeps") |> 
    select("deviceID" = "deviceID", "recvDeployID" = "deployID", "tsStartRecv" = "tsStart", "tsEndRecv" = "tsEnd",
           "recvType" = "receiverType", "recvDeployLat" = "latitude", "recvDeployLon" = "longitude") |>
    mutate(tsEndRecv = if_else(is.na(tsEndRecv), max_ts, tsEndRecv))
  
  # Get tags
  t <- tbl(db, "tagDeps") |> 
    select("tagID", "tagDeployID" = "deployID", "speciesID", "tsStartTag" = "tsStart", "tsEndTag" = "tsEnd", "test") |>
    mutate(tsEndTag = if_else(is.na(tsEndTag), max_ts, tsEndTag))
  
  # Get batches
  b <- tbl(db, "batchRuns") |>
    distinct()
  
  # Combine with runs
  tbl(db, "runs") |>
    rename("tagID" = "motusTagID") |>
    # Add in tags by tagID *and* overlap of start/end of tag deployment with the beginning of a run
    left_join(t, by = join_by(tagID, between(tsBegin, tsStartTag, tsEndTag))) |>
    slice_max(order_by = tsStartTag, by = "runID", with_ties = FALSE) |> # If multiple deps, take only the one with max start
    # Add in batchRuns by runID (to get the batchID)
    left_join(b, by = "runID") |>
    slice_max(order_by = batchID, by = "runID", with_ties = FALSE) |> # If multiple batches, take only the latest one
    # Add in batches by batchID (to get the deviceID)
    left_join(tbl(db, "batches") |> select("batchID", "motusDeviceID"), by = "batchID") |>
    # Add in receivers by deviceID *and* overlap of receiver deployment time with the beginning of a run
    left_join(r, by = join_by(motusDeviceID == deviceID, between(tsBegin, tsStartRecv, tsEndRecv))) |>
    slice_max(order_by = tsStartRecv, by = "runID", with_ties = FALSE) |> # If multiple deps, take only the one with max start
    rename("recvDeviceID" = "motusDeviceID") |>
    select(-"batchIDbegin") |>
    distinct()
}

# ---- custom_hits ----
custom_hits <- function(db) {
  # Replacement for NULL tsEnd values (i.e. today plus change)
  max_ts <- round(as.numeric(Sys.time()) + 1000)
  
  # Get Receivers
  r <- tbl(db, "recvDeps") |> 
    select("deviceID", "recvDeployID" = "deployID", "tsStartRecv" = "tsStart", "tsEndRecv" = "tsEnd",
           "recvType" = "receiverType") |>
    mutate(tsEndRecv = if_else(is.na(tsEndRecv), max_ts, tsEndRecv))
  
  # Get tags
  t <- tbl(db, "tagDeps") |> 
    select("tagID", "tagDeployID" = "deployID", "speciesID", "tsStartTag" = "tsStart", "tsEndTag" = "tsEnd") |>
    mutate(tsEndTag = if_else(is.na(tsEndTag), max_ts, tsEndTag))  
  
  # Combine with the rest - And OMIT BAD filtered observations from previous step
  tbl(db, "hits") |>
    select(-"validated") |>
    # Add in runs and motusFilters
    left_join(tbl(db, "runs") |> select("runID", "motusFilter", "tagID" = "motusTagID"), by = "runID") |>
    # Add in tags by tagID *and* overlap of start/end of tag deployment with the beginning of a run
    left_join(t, by = join_by(tagID, between(ts, tsStartTag, tsEndTag))) |>
    # # Filter out "Bad runs" from previous step
    # left_join(tbl(db, "bad_data"), by = c("runID", "tagID")) |>
    # filter(is.na(BAD)) |>
    # select(-"BAD") |>
    # Add in batches by batchID (to get the deviceID)
    left_join(tbl(db, "batches") |> select("batchID", "motusDeviceID"), by = "batchID") |>
    # Add in receivers by deviceID *and* overlap of receiver deployment time with the beginning of a run
    left_join(r, by = join_by(motusDeviceID == deviceID, between(ts, tsStartRecv, tsEndRecv))) |>
    # Keep only relevant data
    select(-"batchID", -"tsStartRecv", -"tsEndRecv", -"tsStartTag", -"tsEndTag") |>
    rename("recvDeciveID" = "motusDeviceID")
}

load_hits <- function() {
  map(projects, \(x) {
    f <- filter(arws, proj_id == x) |>
      pull(file)
    open_dataset(f, format = "feather")
  })
}

plot_bouts <- function(trans, trans_clean = NULL, bouts, tagDeployID, save = FALSE) {
  
  trans <- filter(trans, tagDeployID == .env$tagDeployID)
  if(is.null(trans_clean)) trans_clean <- trans
  trans_clean <- filter(trans_clean, tagDeployID == .env$tagDeployID)
  bouts <- filter(bouts, tagDeployID == .env$tagDeployID)

  g1 <- plot_map(trans)
  
  lon <- range(c(trans$lon1, trans$lon2)) |> diff()
  lat <- range(c(trans$lat1, trans$lat2)) |> diff()
  ratio <- abs(lat) / abs(lon)

  g2 <- plot_coord(trans, trans_clean, bouts, "lat")
  g3 <- plot_coord(trans, trans_clean, bouts, "lon")
  
  #if(ratio < 2) g_coord <- g2 + g3 else g_coord <- g2 / g3
  g_coord <- g2 / g3
  g_coord <- g_coord + plot_layout(guides = "collect")
  
  #if(ratio > 1) g <- g1 + g_coord else g <- g1 / g_coord
  g <- g1 + g_coord
  g <- g + 
    plot_annotation(title = paste0("tagDeployID: ", tagDeployID, " (", 
                                   as_date(min(bouts$timeBegin)), " to ", 
                                   as_date(max(bouts$timeEnd)), ")"))
  
  if(save) {
    ggsave(plot = g, filename = paste0("check_", tagDeployID, ".pdf"), width = 20, height = 12)
  }
  g
}

plot_map <- function(trans, tagDeployID = NULL) {
  
  if(!is.null(tagDeployID)) trans <- filter(trans, tagDeployID == .env[["tagDeployID"]])
  if(!"problem" %in% names(trans)) {
    trans$problem <- trans$problem_fast <- trans$problem_manual <- FALSE
    trans$resolved <- TRUE
  }

  recvs <- select(trans, stn1, lat1, lon1, stn2, lat2, lon2) |>
    pivot_longer(cols = c("stn1", "stn2", "lat1", "lat2", "lon1", "lon2"), 
                 names_to = c(".value", "pair"), 
                 names_pattern = "([a-z]{2,3})(\\d+)") |>
    select(-"pair") |>
    distinct()

  bb <- trans |>
    summarize(xmin = min(c(lon1, lon2)), xmax = max(c(lon1, lon2)),
              ymin = min(c(lat1, lat2)), ymax = max(c(lat1, lat2)))
  
  ry <- diff(range(bb[3:4]))
  rx <- diff(range(bb[1:2]))
  
  if(rx > 31 & any(!trans$problem)) {
    bb <- filter(trans, !problem) |>
      summarize(xmin = min(c(lon1, lon2)), xmax = max(c(lon1, lon2)),
                ymin = min(c(lat1, lat2)), ymax = max(c(lat1, lat2)))

    ry <- diff(range(bb[3:4]))
    rx <- diff(range(bb[1:2]))
  }
  
  if(rx < 0.5) {
    add <- (0.5 - rx)/2
    bb[1:2] <- bb[1:2] + (c(-1, 1) * add)
  }
  if(ry < 2) {
    add <- (2 - ry)/2
    bb[3:4] <- bb[3:4] + (c(-1, 1) * add)
  }
  
  ry <- diff(range(bb[3:4]))
  rx <- diff(range(bb[1:2]))
  ratio <- ry/rx
  
  if(ratio > 2) {
    rx1 <- ry/2
    add <- (rx1 - rx)/2
    bb[1:2] <- bb[1:2] + (c(-1, 1) * add)
  } else if (ratio < 2) {
    ry1 <- rx * 2
    add <- (ry1 - ry)/2
    bb[3:4] <- bb[3:4] + (c(-1, 1) * add)
  }
  
  if(any(trans$problem_fast | trans$problem_manual)) {
    labs <- trans |>
      filter(problem_fast | problem_manual) |>
      summarize(
        lon = lon1, 
        lat = lat1,
        recv_pair = recv_pair[1],
        problem_chr = pmap_chr(list(problem_fast, problem_manual),
                               \(x, y) paste0(c("too fast", "manual")[c(x, y)], collapse = ", ")),
        nudge_x = sign(lon2 - lon1) * ((bb[[2]] - bb[[1]]) * 0.1),
        nudge_y = sign(lat1 - lat2) * ((bb[[4]] - bb[[3]]) * 0.05), 
        .by = trans_id) |>
      select(-trans_id) |>
      distinct()
  } else labs <- data.frame()
  
  g <- ggplot(trans, aes(x = lon1, y = lat1)) +
    annotation_map_tile(type = "cartolight", zoomin = 0) +
    geom_spatial_segment(
      data = filter(trans, resolved, problem),
      aes(xend = lon2, yend = lat2), colour = "grey", linewidth = 2,
      arrow = arrow(length = unit(10, "pt"), type = "closed"), crs = 4326, na.rm = TRUE) +
    coord_sf(xlim = unlist(bb[1:2]), ylim = unlist(bb[3:4]), crs = 4326)
  
  if(nrow(labs) > 0) {
    g <- g + geom_spatial_label(data = labs, 
                               aes(x = lon, y = lat, label = problem_chr), 
                               nudge_x = labs$nudge_x, nudge_y = labs$nudge_y, 
                               colour = "grey", crs = 4326)
  }
  g +
    geom_spatial_segment(
      data = filter(trans, (resolved & !problem) | (!resolved)),
      aes(xend = lon2, yend = lat2, colour = mid_date, linetype = problem),
      arrow = arrow(length = unit(10, "pt"), type = "closed"), crs = 4326, na.rm = TRUE) +
    geom_spatial_label_repel(data = recvs, aes(label = stn, x = lon, y = lat), 
                             crs = 4326) +
    
    scale_size_date(range = c(6, 3)) +
    scale_linetype_manual(values = c("FALSE" = "solid", "TRUE" = "dotted"),
                          guide = "none") +
    scale_linewidth_manual(values = c("FALSE" = 0.75, "TRUE" = 1.55),
                           guide = "none") +
    scale_colour_viridis_c(option = "inferno", transform = "date", 
                           begin = 0.20, end = 0.85) +
    guides(colour = guide_colourbar(reverse = TRUE))
}

plot_coord <- function(trans, trans_clean, bouts, coord) {
  lim <- range(bouts$dateBegin)
  lim[2] <- lim[2] + difftime(lim[2], lim[1], units = "days") * 0.1
  lim <- as_datetime(lim)
  
  if(!"bad" %in% names(trans)) trans$bad <- FALSE
  if(!"bad" %in% names(trans_clean)) trans_clean$bad <- FALSE
  if(!"bad" %in% names(bouts)) bouts$bad <- FALSE
  if(!"problem" %in% names(trans)) trans$problem <- FALSE
  if(!"problem" %in% names(trans_clean)) trans_clean$problem <- FALSE
  
  bouts <- mutate(bouts, stn_group = factor(stn_group))
  
  recvs <- select(bouts, recvDeployLat, recvDeployLon, stn_group) |>
    distinct()
  pal <- viridis::magma(n = length(recvs$stn_group), begin = 0.10, end = 0.9) |>
    set_names(levels(recvs$stn_group))
  pal_txt <- farver::decode_colour(pal, "rgb", "hcl")
  pal_txt <- if_else(pal_txt[, "l"] > 50, "black", "white") |>
    set_names(levels(recvs$stn_group))
  
  c1 <- str_subset(names(bouts), regex(coord, ignore_case = TRUE))
  c2 <- paste0(coord, 1)
  c3 <- paste0(coord, 2)
  
  b <- filter(bouts, !bad)
  b_bad <- filter(bouts, bad)
  
  t <- trans_clean #filter(trans, !problem | (problem & !resolved))
  t_bad <- anti_join(trans, trans_clean, by = c("id1", "id2"))
  
  ggplot(b, aes(x = timeBegin, y = .data[[c1]])) +
    theme_bw() +
    
    # Good bouts (or unresolved)
    geom_segment(aes(xend = timeEnd, colour = stn_group), na.rm = TRUE, linewidth = 3) +
    geom_point(data = arrange(b, desc(stn_group)), aes(colour = stn_group), na.rm = TRUE, size = 2) +
    
    # Bad bouts (resolved)
    geom_segment(data = b_bad, aes(xend = timeEnd), colour = "grey", na.rm = TRUE, linewidth = 3) +
    geom_point(data = arrange(b_bad, desc(stn_group)), colour = "grey", na.rm = TRUE, size = 2) +
    
    # Good trans (or unresolved)
    geom_segment(data = t, aes(
      x = e1, xend = b2, y = .data[[c2]], yend = .data[[c3]], linetype = problem), na.rm = TRUE) +
    
    # Bad trans (resolved)
    geom_segment(data = t_bad, aes(
      x = e1, xend = b2, y = .data[[c2]], yend = .data[[c3]], linetype = problem), 
      colour = "grey", na.rm = TRUE) +
    
    geom_label_repel(data = recvs, aes(label = stn_group, y = .data[[c1]], x = lim[2], 
                                       fill = stn_group, colour = stn_group),
               size = 2.5, direction = "y", point.size = NA, box.padding = 0, label.padding = 0.15,
               colour = pal_txt[recvs$stn_group]) +
    scale_fill_manual(values = pal) +
    scale_colour_manual(values = pal) +
    scale_x_datetime(limits = lim)
}

plot_check <- function(hits, tagDeployID) {
  h <- filter(h, tagDeployID == .env[["tagDeployID"]]) |>
    arrange(time)

  if(nrow(h) > 0) {

    sig_range <- function(limits) {
      diff <- abs(diff(limits))
      case_when(
        diff < 5 ~ limits + c(-7.5, 7.5),
        diff < 10 ~ limits + c(-5, 5),
        diff < 15 ~ limits + c(-2.5, 2.5),
      TRUE ~ limits)
    }

    g <- list()
    for(i in unique(h$trans_id)) {
      hh <- filter(h, trans_id == i) |>
        arrange(time) |>
        mutate(problem_pair = factor(problem_pair, levels = unique(problem_pair)))
        
      h_lab <- hh |>
        summarize(time = min(time), sig = median(sig), 
                  .by = c("recvDeployID", "problem_pair")) |>
        mutate(recvDeployID = as.factor(recvDeployID))
      
      g1 <- ggplot(data = hh, aes(x = time, y = sig, colour = factor(recvDeployID),
                                shape = ant)) +
        theme_bw() +
        theme(legend.position = "none", aspect.ratio = 1, plot.title = element_text(hjust = 0.5)) +
        geom_point(size = 1) +
        geom_line(stat = "smooth", alpha = 0.5, method = "gam", formula = y ~ s(x, k = 10, bs = "cs")) +
        geom_label(data = h_lab, aes(label = recvDeployID, shape = NULL), size = 3, hjust = -0.2) +
        scale_colour_viridis_d(option = "magma", end = 0.8) +
        scale_y_continuous(limits = sig_range) +
        scale_x_datetime(date_labels = "%H:%M") +
        facet_wrap(~ problem_pair, scales = "free", axes = "all") +
        labs(title = paste0("Transition ", i))
      g[[length(g) + 1]] <- g1
    }
    
    if(length(g) > 1) ncol <- 2 else ncol <- 1
    
    g <- wrap_plots(g, ncol = ncol) +
      plot_layout(heights = 1) +
      plot_annotation(title = paste0("tagDeployID: ", tagDeployID)) &
      theme(plot.margin = unit(c(1,0,1,1), units = "pt"))
  } else g <- NULL
  g
}


# Create bouts from runs by adding them together and then resorting.
# The resorting helps to get at bouts that overlap each other.
create_bouts <- function(x, cutoff) {
  x1 <- x |> 
    mutate(
      lag1 = as_units(difftime(lead(timeBegin), timeEnd, units = "min")),
      lag2 = as_units(difftime(lead(timeEnd), timeEnd, units = "min")),
      lag = pmin(lag1, lag2),
      prev_lag = lag(lag, default = set_units(Inf, "min")),
      start = prev_lag >= .env$cutoff,
      bout_id = cumsum(start))
  
  x1 |>
    group_by(bout_id) |>
    summarize(timeBegin = min(timeBegin), timeEnd = max(timeEnd), 
              runID = list(unlist(runID)),
              n_runs = n(), 
              len = list(unlist(len)),
              ant = list(unlist(ant)),
              total_time = sum(difftime(timeEnd, timeBegin, units = "min")),
              .groups = "drop")
}

calc_trans <- function(x) {
  x |>
    rename(id1 = id) |>
    mutate(next_stn = lead(stn_group), 
           id2 = lead(id1), .by = "tagDeployID") |>
    filter(stn_group != next_stn) |>
    select(tagDeployID, id1, id2)
}

# ---- find_overlaps ----
find_overlaps <- function(x2, type = "logical") {
  #if(3551 %in% x2$stn_group) browser()
  #           comp B/E2  vs. B/E1
  
  m1 <- outer(x2$timeBegin, x2$timeBegin, '>=')
  m2 <- outer(x2$timeEnd, x2$timeEnd, '<=')
  
  m3 <- outer(x2$timeBegin, x2$timeEnd, '<=')
  m4 <- outer(x2$timeEnd, x2$timeBegin, '>=')
  
  ovlps <- (m1 & m3) | (m2 & m4)
  ovlps <- ovlps | t(ovlps) # either id1/id2 or id2/id1
  
  if(type == "df") {
    nms <- paste0(x2$stn_group, "_", x2$bout_id)
    dimnames(ovlps) <- list(nms, nms)
    r <- expand_grid(id1 = nms, id2 = nms) |>
      mutate(overlap = as.vector(ovlps)) |>
      filter(overlap, id1 != id2)
  } else if(type == "logical") {
    r <- any(colSums(ovlps) > 1)
  }
  r
}

# ---- load_runs ----
load_runs <- function() {
  noise_runs <- open_dataset("Data/02_Datasets/noise_runs.feather", format = "feather")
  noise_hits <- open_dataset("Data/02_Datasets/noise_hits.feather", format = "feather")
  
  open_dataset("Data/02_Datasets/runs", format = "feather") |>
    anti_join(noise_runs, by = c("runID")) |>
    anti_join(noise_hits, by = c("runID")) |>
    filter(len > 2) |>
    select(runID, timeBegin, timeEnd, tagID, ant, len, motusFilter, tagDeployID, 
           speciesID, starts_with("recvDeploy"), matches("date|month|year")) |>
    arrange(timeBegin) |>
    collect()
}

# ---- create_overlapping ----
create_overlapping <- function(bouts) {
  nest(bouts, data = -tagDeployID) |>
    mutate(overlaps = map(data, \(x) find_overlaps(x, "df"), .progress = interactive())) |>
    select(-"data") |>
    unnest("overlaps") |>
    mutate(id1 = paste0(tagDeployID, "_", id1),
           id2 = paste0(tagDeployID, "_", id2))
}

# ---- create_trans ----
create_trans <- function(bouts, overlapping_bouts, dist) {
  
  # Get transitions between stations
  trans <- bind_rows(
    bouts |> 
      arrange(timeBegin) |>
      calc_trans(),
    bouts |> 
      arrange(timeEnd) |>
      calc_trans()
  ) |>
    distinct() |>
    
    # Add in any missing transitions with overlapping bouts
    full_join(overlapping_bouts, by = c("tagDeployID", "id1", "id2")) |>
    # Add in data related to the first station
    left_join(select(bouts, "stn1" = "stn_group", "id1" = "id", 
                     "b1" = "timeBegin", "e1" = "timeEnd", 
                     "recv1" = "recvDeployID", "lat1" = "recvDeployLat", "lon1" = "recvDeployLon"), 
              by = "id1") |>
    # Add in data related to the second station
    left_join(select(bouts, "stn2" = "stn_group", "id2" = "id", 
                     "b2" = "timeBegin", "e2" = "timeEnd", 
                     "recv2" = "recvDeployID", "lat2" = "recvDeployLat", "lon2" = "recvDeployLon"), 
              by = "id2") |>
    
    # Identify transition pairs
    mutate(recv_pair = paste0(recv1, "_", recv2)) |>
    
    # Add in distance between stations
    left_join(select(dist, "recv_pair", "next_dist", "min_time"),
              by = "recv_pair") |>
    
    # Calculate the time and speed taken to move between stations
    mutate(trans_id = row_number(),
           overlap = replace_na(overlap, FALSE),
           lag1 = as_units(difftime(b2, e1, units = "min")),
           lag2 = as_units(difftime(e2, e1, units = "min")),
           time_diff = pmin(lag1, lag2),
           time_diff = set_units(time_diff, "s"),
           speed = set_units(next_dist, "m") / time_diff,
           time_diff = as.duration(as.numeric(time_diff)),
           time_diff = if_else(overlap, duration(0), time_diff),
           # Mid-point date
           mid_date = as_date(e1 + (b2 - e1)/2))
  
  if(nrow(trans) > 1) {
    trans <- trans |>
      # Get rid of duplicate transitions when have overlapping pairs
      mutate(id_pair = map2_chr(id1, id2, \(x, y) paste0(sort(c(x, y)), collapse = " - "))) |>
      arrange(b1) |>
      slice(1, .by = "id_pair") |>
      select(-"id_pair") |>
      # Seasonal details
      mutate(migration = case_when(year(e1) != year(b2) ~ NA,
                                   month(e1) %in% 8:12 & month(b2) %in% 8:12 ~ "south",
                                   month(e1) %in% 2:7 & month(b2) %in% 2:7 ~ "north",
                                   TRUE ~ NA),
             connected = speed > set_units(5, "m/s"))
  }  else trans <- mutate(trans, migration = NA_character_, connected = NA)
  
  trans
}

# ---- id_problems ----
id_problems <- function(trans, problems_manual = NULL) {
  
  # Identify suspiciously short bouts related to transitions
  #b <- filter(bouts, n_runs < 5, len_max < 10) |>
  # pull(id)  
  
  # Add manual problems
  if(!is.null(problems_manual)) {
    trans_fix1 <- semi_join(trans, rename(problems_manual, "stn1" = "stn_group"),
                           by = join_by("tagDeployID", "stn1", within("b1", "e1", "start", "end"))) |>
      mutate(problem_manual = TRUE)
    
    trans_fix2 <- semi_join(trans, rename(problems_manual, "stn2" = "stn_group"),
                            by = join_by("tagDeployID", "stn2", within("b2", "e2", "start", "end"))) |>
      mutate(problem_manual = TRUE)
    
    trans <- trans |>
      mutate(problem_manual = FALSE) |>
      rows_update(trans_fix1, by = c("tagDeployID", "stn1", "trans_id")) |>
      rows_update(trans_fix2, by = c("tagDeployID", "stn2", "trans_id"))
      
  } else trans$problem_manual <- FALSE
  
  trans |>
    mutate(
      # Identify problematic transitions (overlaps or too fast)
      problem_fast = next_dist >= max_tower_dist & (time_diff < min_time | overlap),
      
      # TODO: Results in 300+ ids to check... leave for now
      problem_short_bout = FALSE, # TODO: if using, replace with id1 %in% b | id2 %in% b, (add bouts in as arg)
      
      # Direction
      lat_diff = lat2 - lat1, 
      problem_direction = (migration == "south" & lat_diff > 1) |
        (migration == "north" & lat_diff < -1),
      problem_direction = replace_na(problem_direction, FALSE),
      
      # Overall
      problem = problem_fast | problem_short_bout | problem_manual | problem_direction) |>
    select(-matches("\\.(x|y)$"))
}

# ---- resolve_stns ----
resolve_stns <- function(bouts, trans, problems_manual = NULL) {

  b <- summarize(
    bouts,
    len_max = max(len_max),
    #n_bouts = n(),
    #n_runs = sum(n_runs),
    total_time = sum(total_time),
    .by = c("tagDeployID", "stn_group"))
  
  t <- filter(trans, problem) |>
    select(tagDeployID, trans_id, stn1, stn2, id1, id2, b1, e1, b2, e2, lat_diff, contains("problem")) |>
    pivot_longer(cols = c("stn1", "stn2", "id1", "id2", "b1", "e1", "b2", "e2"), 
                 names_to = c(".value", "pair"), 
                 names_pattern = "([a-z]{1,3})(\\d+)") |>
    rename("stn_group" = "stn") |>
    left_join(b, by = c("tagDeployID", "stn_group")) |>
    arrange(tagDeployID, trans_id, total_time) |>
    select(tagDeployID, trans_id, contains("problem"), stn_group, id, 
           b, e,
           len_max, total_time)
  
  # Mark the station to omit within a group that was able to have a (resolved) station
  
  # Mark manually identified problems
  if(!is.null(problems_manual)) {
    t <- left_join(t, select(problems_manual, "tagDeployID", "stn_group", "start", "end", "omit" = "problem_manual"), 
                   by = join_by("tagDeployID", "stn_group", within("b", "e", "start", "end"))) |>
      select(-"b", -"e", -"start", -"end")
  } else t$omit <- NA
  
  t |>
    mutate(
      # Mark directional and short bout problems as FALSE (need to be manually checked)
      omit = if_else(is.na(omit) & (problem_direction | problem_short_bout), FALSE, NA),
      # Choose station to omit otherwise
      omit = if_else(is.na(omit), 
                     total_time <= (suppressWarnings(max(total_time)) / 10) | 
                       len_max <= (suppressWarnings(max(len_max)) / 10), 
                     omit), 
      .by = c("tagDeployID", "trans_id")) |>
    mutate(resolved = any(omit), .by = c("tagDeployID", "trans_id"))
}

#' Convert unit vectors to regular numeric vectors
#'
#' @param x Vector (or column)
#' @param units Units to ensure they match
#'
flatten_units <- function(x, units) {
  if(inherits(x, "difftime")) x <- as_units(x)
  set_units(x, units, mode = "standard") |>
    as.numeric()
}