Wildlife-Movement-Analysis-CLI/config.yaml at main · piyarshah/Wildlife-Movement-Analysis-CLI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# ============================================================
# Movement Analysis CLI  v2.1.0 — config.yaml
# Run with : python run.py config.yaml
# Validate : python run.py config.yaml --dry-run
#
# Tip: make one config file per dataset.
#   eg. python run.py config_caribou.yaml
#   eg. python run.py config_elephant.yaml
# To keep each dataset's outputs apart, set output_dir: outputs/{dataset_name}
# ============================================================


# ── IDENTITY ─────────────────────────────────────────────────
# dataset_name: label used for logging and metadata only.
# It changes the output location ONLY if output_dir (below) contains
# the {dataset_name} placeholder.
# Placeholder: replace null with your dataset's name.
dataset_name: null

# output_dir: destination for all outputs.
# The default is one flat "outputs" folder next to run.py.
# For one folder per species, use: outputs/{dataset_name}
output_dir: 'outputs'


# ── REQUIRED: Input data ─────────────────────────────────────
# Placeholder: replace null with the absolute path to your input data.
data_path: null

# Column names (timestamp, individual ID, longitude, latitude).
timestamp_col: timestamp
id_col: individual-local-identifier
lon_col: location-long
lat_col: location-lat


# ── CRS ──────────────────────────────────────────────────────
# input_crs: the CRS your raw GPS data is in.
#   WGS84 geographic (EPSG:4326) is the default and fits most GPS data.
#   For already-projected data, give its CRS here; the pipeline
#   converts automatically.
input_crs: 'EPSG:4326'

# metric_crs: the projected CRS used for all distance and area
#   calculations, e.g. UTM 34S (EPSG:32734) for southern Africa.
#   Look up your zone at https://epsg.io
# Placeholder: replace null with a code in the same format as input_crs.
metric_crs: null


# ── OPTIONAL: Environmental layers ───────────────────────────
# Delete or comment out water_path to skip all water/distance analysis.
# The file must be in the same folder.
# NOTE(review): "same folder" is not defined here — presumably the folder
# containing run.py; confirm.
water_path: 'data/water.geojson'


# ── Output ───────────────────────────────────────────────────
# Switches for saving plots and tables; set either to false to skip it.
save_plots:  true
save_tables: true


# ── Focus individual ─────────────────────────────────────────
# Used only for the run log (which individual is highlighted).
# Figs 1–4 are now produced for ALL individuals automatically.
# Defaults to the first individual found if not set.
# focus_individual: "1001"


# ── TEMPORAL RESAMPLING (change 2) ────────────────────────────
# Fixes are resampled to a regular interval before ALL analysis.
# This is essential when the datasets being compared have different
# fix rates (e.g. caribou every 2 h, elephant every 4 h).
#
# Recommended values (minutes):
#   High-frequency collar (< 30 min): 30
#   Standard collar (1–2 h):          60
#   Low-frequency collar (> 2 h):     leave disabled
#
# A value of 0 (or commenting the key out) disables resampling.
resample_interval_minutes: 0


# ── PLOT SUBSAMPLING (change 3) ───────────────────────────────
# Applies ONLY to trajectory-line plots (fig 1): every N-th point is
# drawn, while the analysis always uses the full data.
# This cuts clutter and speeds up rendering for dense datasets.
# Use 1 to plot every fix.
plot_trajectory_subsample: 50


# ── KDE parameters ───────────────────────────────────────────
# Used as scipy's bw_method; a lower value gives tighter contours.
kde_bandwidth: 0.3
# Utilisation distribution level, between 0 and 1.
kde_ud_level: 0.95
# Minimum number of fixes per season for a seasonal KDE.
kde_min_fixes: 200
# Grid resolution, in cells per axis.
kde_grid_size: 200
# Cap on the points used to fit the KDE; larger inputs are downsampled.
kde_max_pts: 5000


# ── CLUSTERING (changes 1, 6, 7) ─────────────────────────────

# --- HDBSCAN (change 7) ---
# HDBSCAN works from a density hierarchy, so it adapts to clusters of
# varying shape and is more robust than DBSCAN across species whose
# movement scales differ.
# Install with: pip install hdbscan
# Enable with use_hdbscan: true. If the package is not installed, the
# pipeline falls back to DBSCAN.
use_hdbscan: false
# Minimum number of fixes needed to form a cluster.
hdbscan_min_cluster_size: 50
# null means "use the same value as hdbscan_min_cluster_size".
hdbscan_min_samples: null

# --- DBSCAN eps calibration (change 1) ---
# dbscan_eps_method selects how eps is derived for each individual:
#
#   "knn"       (DEFAULT) eps is taken from the k-nearest-neighbour
#               distance distribution at dbscan_knn_percentile. This
#               reflects the actual point-cloud density of each
#               individual's fixes, so it is fully data-driven and does
#               not depend on step length or fix rate. It works across
#               species and spatial scales; prefer it unless you have a
#               specific reason not to.
#
#   "adaptive"  eps = median_step_length * dbscan_k
#               Retained for backward compatibility. Choose it only when
#               eps should scale with the animal's step length
#               distribution.
#
dbscan_eps_method: knn
# p90 works well across species. Raise it if you get too many clusters;
# lower it if you get too few.
dbscan_knn_percentile: 90.0
# Read only when dbscan_eps_method is "adaptive".
dbscan_k: 7.0

dbscan_min_samples: 20
dbscan_min_cluster_size: 100

# eps is clamped to [dbscan_eps_min, dbscan_eps_max], in metres.
# The wide default range covers everything from small mammals to elephants.
dbscan_eps_min: 10.0
dbscan_eps_max: 50000.0


# ── GMM behaviour model ─────────────────────────────────────
# Candidate K values; BIC selects the best one.
gmm_k_list:
  - 2
  - 3
gmm_random_state: 42


# ── Data quality thresholds ─────────────────────────────────
min_fixes_per_individual: 100
# Set to 100.0 to disable.
speed_outlier_percentile: 99.9


# ── Entropy threshold ────────────────────────────────────────
# Fixes whose GMM entropy lies above this percentile are left out of
# behaviour–space analysis. Example: 75 excludes the 25% most
# uncertain fixes.
entropy_percentile_threshold: 75.0


# ── Seasonal definition ─────────────────────────────────────
# Placeholder: replace the empty list with your wet-season months.
# NOTE(review): the expected month format (presumably numbers 1–12) is
# not shown here — confirm.
wet_months: []


# ── Reproducibility ─────────────────────────────────────────
# NOTE(review): gmm_random_state is configured separately in the GMM
# section; confirm which steps read this seed.
random_seed: 42


# ── Parallelism ──────────────────────────────────────────────
# Set to -1 to use all CPU cores.
n_jobs: 1