# Load shared setup (libraries and helpers such as load_hits()).
source("XX_setup.R")

# Runs previously flagged as noise by the coarse-scale filtering step.
noise_runs <- open_dataset("Data/02_Datasets/noise_runs.feather", format = "feather")

# All runs, with the noisy ones removed. Join keys are spelled out so the
# anti-join matches the hit-level join below and cannot silently pick up
# any other columns the two datasets happen to share.
runs <- open_dataset("Data/02_Datasets/runs", format = "feather") |>
  anti_join(noise_runs, by = c("runID", "recvDeployID", "tagDeployID"))

# Hit-level data (one element per project), each with noisy runs removed.
hits <- load_hits() |>
  map(\(x) anti_join(x, noise_runs, by = c("runID", "recvDeployID", "tagDeployID")))
Fine-scale Filtering
Here we perform fine-scale filtering which involves more assessments of potential issues.
Many of these steps rely on hit-level data, as opposed to run-level data from the previous steps.
Setup
We’ll also use a modified version of the motusFilter, such that hits from SENSORGNOME stations with a freqSD > 0.1 will be considered ‘bad’ data (i.e., we’ll set the motusFilter value to 0 for these hits).
# Tighten the motusFilter: hits from SENSORGNOME receivers with a high
# frequency standard deviation (freqSD > 0.1) are considered 'bad' data,
# so their motusFilter value is forced to 0.
# The !is.na(freqSD) guard matters: without it, a missing freqSD makes the
# whole condition NA, and if_else() would then return NA — silently
# clobbering motusFilter for those hits instead of leaving it untouched.
hits <- map(hits, \(x) {
  mutate(x, motusFilter = if_else(
    recvType == "SENSORGNOME" & !is.na(freqSD) & freqSD > 0.1,
    0, motusFilter))
})
Bad Runs
Now we’ll calculate the proportion of good/bad data per tag, per receiver, per day.
- We’ll omit all runs on a day for this tag/receiver combo where half or fewer of the hits are ‘good’ (p_good ≤ 0.5)
# Per tag x receiver x day, compute the proportion of hits that pass the
# motusFilter; where half or fewer are 'good' (p_good <= 0.5), flag every
# run for that combination on that day.
noise_quality <- map(hits, \(x) {
  # Day-level quality summary. summarize() already yields one row per
  # group, so the original trailing distinct() was redundant; likewise a
  # pre-select is unnecessary since only the grouping columns and
  # motusFilter are referenced.
  bad_days <- x |>
    summarize(p_good = sum(motusFilter, na.rm = TRUE) / n(),
              .by = c("tagID", "tagDeployID", "recvDeployID", "date")) |>
    filter(p_good <= 0.5)
  # Keep only the runs that fall on those low-quality days.
  semi_join(x, bad_days, by = c("tagID", "tagDeployID", "recvDeployID", "date")) |>
    select("runID", "tagDeployID", "recvDeployID") |>
    collect()
}) |> list_rbind()
Ambiguous detections
Let’s collect all runs where there is some ambiguity. We’ll look at the allruns table for this.
# Collect every run carrying a non-missing ambigID from each project's
# 'allruns' table, tagged with is_ambig = TRUE. Projects with no
# ambiguous runs return NULL so list_rbind() simply skips them; the list
# names (project IDs) become the proj_id column.
ambig_ids <- map(dbs, \(db) {
  ambig <- tbl(db, "allruns") |>
    filter(!is.na(ambigID)) |>
    select("runID", "tagID" = "motusTagID", "ambigID") |>
    distinct() |>
    collect() |>
    mutate(is_ambig = TRUE)
  if (nrow(ambig) > 0) ambig else NULL
}) |>
  list_rbind(names_to = "proj_id") |>
  mutate(proj_id = as.integer(proj_id))
Now let's see if any of these runs are even left in our data after filtering…
# Check whether any ambiguous runs survive the earlier filters: drop runs
# from noisy tags and low-quality days, then keep only runs that appear
# in ambig_ids. Join keys are implicit here (all shared columns) —
# presumably runID/tagDeployID/recvDeployID; verify if this is reused.
runs |>
  anti_join(noise_tags) |>
  anti_join(noise_quality) |>
  semi_join(ambig_ids) |>
  collect()
# A tibble: 0 × 29
# ℹ 29 variables: runID <int>, tsBegin <dbl>, tsEnd <dbl>, done <int>,
# tagID <int>, ant <chr>, len <int>, nodeNum <chr>, motusFilter <dbl>,
# tagDeployID <int>, speciesID <int>, tsStartTag <dbl>, tsEndTag <dbl>,
# test <int>, batchID <int>, recvDeviceID <int>, recvDeployID <int>,
# tsStartRecv <dbl>, tsEndRecv <dbl>, recvType <chr>, recvDeployLat <dbl>,
# recvDeployLon <dbl>, timeBegin <dttm>, timeEnd <dttm>, dateBegin <date>,
# dateEnd <date>, monthBegin <dbl>, yearBegin <dbl>, proj_id <int>
There are no ambiguous runs left in the data after cleaning, so we’ll just ignore them for now.
Looking at the filters
# Combine the run-level flags (noise_tags) and the day-quality flags
# (noise_quality) into one table of 'bad' runs, de-duplicated on the
# three identifying keys.
noise_hits <- bind_rows(noise_tags, noise_quality) |>
  select("runID", "tagDeployID", "recvDeployID") |>
  distinct()
noise_hits
# A tibble: 338,272 × 3
runID tagDeployID recvDeployID
<int> <int> <int>
1 587803069 41241 8385
2 587803894 41241 8385
3 607721978 41241 7952
4 607723494 41241 7952
5 491015464 41214 8415
6 491123194 41205 8415
7 629411593 52135 8415
8 631079364 44341 5417
9 640346863 52140 9006
10 640347023 52140 9006
# ℹ 338,262 more rows
Next we’ll take a look at how this compares to the motusFilter
With only the runs filtering
# Tally motusFilter values (good = 1, bad = 0) per project with ONLY the
# run-level filtering applied, one row per project after widening.
count(runs, proj_id, motusFilter) |>
  collect() |>
  pivot_wider(names_from = motusFilter, values_from = n) |>
  arrange(proj_id)
# A tibble: 11 × 3
proj_id `1` `0`
<int> <int> <int>
1 168 267523 148857
2 352 543816 212388
3 364 1617 14453
4 373 477672 292145
5 393 24927 49501
6 417 604562 354220
7 464 556 31519
8 484 601907 492931
9 515 29846 153426
10 551 625166 641404
11 607 4 427
With both the runs and hit filtering
# Same tally as above, but with the hit-level 'bad run' filtering applied
# as well, for comparison against the runs-only table.
anti_join(runs, noise_hits, by = c("runID", "tagDeployID", "recvDeployID")) |>
  count(proj_id, motusFilter) |>
  collect() |>
  pivot_wider(names_from = motusFilter, values_from = n) |>
  arrange(proj_id)
# A tibble: 11 × 3
proj_id `1` `0`
<int> <int> <int>
1 168 267235 118672
2 352 542725 174567
3 364 1584 7799
4 373 476839 267387
5 393 24654 42336
6 417 603126 315663
7 464 472 16293
8 484 601017 483676
9 515 29753 149450
10 551 619025 489385
11 607 4 172
There is still a fair amount of ‘bad’ data according to the motusFilter… but we are definitely getting closer.
Saving filters
We’ll save the ‘bad data’ for use in the next steps.
# Persist the combined 'bad data' run list for the next scripts.
write_feather(noise_hits, sink = "Data/02_Datasets/noise_hits.feather")
Wrap up
Disconnect from the databases
# Close every open project database connection.
walk(dbs, dbDisconnect)