# Load shared setup: libraries and helper functions (e.g., load_hits(), dbs).
source("XX_setup.R")

# Runs previously flagged as noise during the coarse filtering steps.
noise_runs <- open_dataset("Data/02_Datasets/noise_runs.feather", format = "feather")

# All runs, with known-noise runs removed.
# NOTE(review): anti_join() with no `by` joins on all common columns — confirm
# this matches the keys used for the hits anti_join below.
runs <- open_dataset("Data/02_Datasets/runs", format = "feather") |>
  anti_join(noise_runs)

# Hit-level data (one element per project), with hits belonging to noisy runs
# removed via the run/receiver/tag-deployment keys.
hits <- load_hits() |>
  map(\(x) anti_join(x, noise_runs, by = c("runID", "recvDeployID", "tagDeployID")))
Fine-scale Filtering
Here we perform fine-scale filtering which involves more assessments of potential issues.
Many of these steps rely on hit-level data, as opposed to run-level data from the previous steps.
Setup
We’ll also use a modified version of the motusFilter, such that hits from
SENSORGNOME stations with a freqSD > 0.1 will be considered ‘bad’ data
(i.e., we’ll set the motusFilter value to 0 for these hits).
# Apply the modified motusFilter: SENSORGNOME hits with freqSD > 0.1 are
# marked 'bad' (motusFilter = 0); all other hits keep their original value.
hits <- map(hits, \(x) {
  mutate(x, motusFilter = if_else(recvType == "SENSORGNOME" & freqSD > 0.1, 0, motusFilter))
})
Bad Runs
Now we’ll calculate the proportion of good/bad data per tag, per receiver, per day.
- We’ll omit all runs on a day for this tag/receiver combo where less than half are ‘good’
# For each project, compute the proportion of 'good' hits (motusFilter) per
# tag / tag deployment / receiver deployment / day, then flag every run on
# tag/receiver/day combinations where half or fewer of the hits are 'good'.
noise_quality <- map(hits, \(x) {
  # Tag/receiver/day combinations with p_good <= 0.5 ('bad' days).
  noise <- x |>
    select("date", "runID", "tagID", "tagDeployID", "recvDeployID", "motusFilter") |>
    summarize(p_good = sum(motusFilter, na.rm = TRUE) / n(),
              .by = c("tagID", "tagDeployID", "recvDeployID", "date")) |>
    filter(p_good <= 0.5) |>
    distinct()

  # Keep only the runs that fall on those flagged combinations.
  semi_join(x, noise, by = c("tagID", "tagDeployID", "recvDeployID", "date")) |>
    select("runID", "tagDeployID", "recvDeployID") |>
    collect()
}) |>
  list_rbind()
Ambiguous detections
Let’s collect all runs where there is some ambiguity. We’ll look at the allruns
table for this.
# Collect all runs with an ambiguous tag ID from each project's 'allruns'
# table, keeping the project id (list names) as a proj_id column.
ambig_ids <- map(dbs, \(x) {
  t <- tbl(x, "allruns") |>
    filter(!is.na(ambigID)) |>
    select("runID", "tagID" = "motusTagID", "ambigID") |>
    distinct() |>
    collect() |>
    mutate(is_ambig = TRUE)
  # Return NULL for projects with no ambiguous runs so list_rbind() skips them.
  if (nrow(t) == 0) t <- NULL
  t
}) |>
  list_rbind(names_to = "proj_id") |>
  mutate(proj_id = as.integer(proj_id))
Now let’s see if any of these runs are even left in our data after filtering…
# Check whether any ambiguous runs remain after the noise filters are applied.
# NOTE(review): noise_tags is created in an earlier step not shown here, and
# these joins rely on default `by` keys (all common columns) — confirm both.
runs |>
  anti_join(noise_tags) |>
  anti_join(noise_quality) |>
  semi_join(ambig_ids) |>
  collect()
# A tibble: 0 × 29
# ℹ 29 variables: runID <int>, tsBegin <dbl>, tsEnd <dbl>, done <int>,
# tagID <int>, ant <chr>, len <int>, nodeNum <chr>, motusFilter <dbl>,
# tagDeployID <int>, speciesID <int>, tsStartTag <dbl>, tsEndTag <dbl>,
# test <int>, batchID <int>, recvDeviceID <int>, recvDeployID <int>,
# tsStartRecv <dbl>, tsEndRecv <dbl>, recvType <chr>, recvDeployLat <dbl>,
# recvDeployLon <dbl>, timeBegin <dttm>, timeEnd <dttm>, dateBegin <date>,
# dateEnd <date>, monthBegin <dbl>, yearBegin <dbl>, proj_id <int>
There are no ambiguous runs left in the data after we cleaned, so we’ll just ignore them for now.
Looking at the filters
# Combine the hit-level filters (noisy tags + low-quality tag/receiver/days)
# into one table of 'bad' runs keyed by run and deployment ids.
# NOTE(review): noise_tags is created in an earlier step not shown here.
noise_hits <- bind_rows(noise_tags, noise_quality) |>
  select("runID", "tagDeployID", "recvDeployID") |>
  distinct()

noise_hits
# A tibble: 338,272 × 3
runID tagDeployID recvDeployID
<int> <int> <int>
1 587803069 41241 8385
2 587803894 41241 8385
3 607721978 41241 7952
4 607723494 41241 7952
5 491015464 41214 8415
6 491123194 41205 8415
7 629411593 52135 8415
8 631079364 44341 5417
9 640346863 52140 9006
10 640347023 52140 9006
# ℹ 338,262 more rows
Next we’ll take a look at how this compares to the motusFilter.
With only the runs filtering:
# Counts of runs per project by motusFilter value (1 = good, 0 = bad),
# using only the run-level filtering applied so far.
count(runs, proj_id, motusFilter) |>
  collect() |>
  pivot_wider(names_from = motusFilter, values_from = n) |>
  arrange(proj_id)
# A tibble: 11 × 3
proj_id `1` `0`
<int> <int> <int>
1 168 267523 148857
2 352 543816 212388
3 364 1617 14453
4 373 477672 292145
5 393 24927 49501
6 417 604562 354220
7 464 556 31519
8 484 601907 492931
9 515 29846 153426
10 551 625166 641404
11 607 4 427
With both the runs and hit filtering
# Same per-project counts after additionally removing the runs flagged by
# the hit-level filters (noise_hits).
anti_join(runs, noise_hits, by = c("runID", "tagDeployID", "recvDeployID")) |>
  count(proj_id, motusFilter) |>
  collect() |>
  pivot_wider(names_from = motusFilter, values_from = n) |>
  arrange(proj_id)
# A tibble: 11 × 3
proj_id `1` `0`
<int> <int> <int>
1 168 267235 118672
2 352 542725 174567
3 364 1584 7799
4 373 476839 267387
5 393 24654 42336
6 417 603126 315663
7 464 472 16293
8 484 601017 483676
9 515 29753 149450
10 551 619025 489385
11 607 4 172
There are still many ‘bad’ data according to the motusFilter
… but we are definitely getting closer.
Saving filters
We’ll save the ‘bad data’ for use in the next steps.
# Persist the combined 'bad data' run list for use in the next steps.
write_feather(noise_hits, sink = "Data/02_Datasets/noise_hits.feather")
Wrap up
Disconnect from the databases
# Close every project database connection opened during setup.
walk(dbs, dbDisconnect)