East Atlantic Coast Aquatic Invasive Species (AIS) Monitoring Program
In this notebook we repeat the analysis on a different dataset to validate the approach.
Download and prepare the dataset¶
from erddapy import ERDDAP
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import panel as pn
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
pn.extension()
e = ERDDAP(
    server="https://erddap.ogsl.ca/erddap",
    protocol="tabledap"
)
e.dataset_id = "mpoEaeTemperature"
e.variables = ['time', 'location', 'sea_water_temperature']
e.constraints = {"time>=": "2015-01-01", "time<=": "2020-12-31"}
os.makedirs('data', exist_ok=True)
csvfile = f"data/ais_tempdata.csv.gz"
if not os.path.exists(csvfile):
print("Downloading...", end='')
df = e.to_pandas()
df.to_csv(csvfile, compression='gzip', index=False)
print("Done.")
else:
df = pd.read_csv(csvfile)
df = df.rename(columns={'time (UTC)': 'time',
'location (unitless)': 'location',
'sea_water_temperature (degree_C)':'sea_water_temperature'})
# Ensure the date column is in datetime format
df['time'] = pd.to_datetime(df['time'])df.sample(5)Loading...
# Create a new column for date only (drop time component)
df['date'] = df['time'].dt.date
# Group and aggregate
daily_avg = (
    df
    .groupby(['location', 'date'])['sea_water_temperature']
    .mean()
    .round(3)  # Limit to 3 decimal places
    .reset_index()
    .rename(columns={'sea_water_temperature': 'daily_avg_temperature'})
)
# Pivot the data: rows = date, columns = station, values = daily average temperature
pivot_df = daily_avg.pivot_table(
    index='date',
    columns=['location'],
    values='daily_avg_temperature'
)
pivot_df.to_csv('dataset_ais.csv')
pivot_df.sample(10)
Explore the data¶
df = pd.read_csv('dataset_ais.csv', parse_dates=True, index_col=0)
This dataset only has observations during a portion of each calendar year (there are no measurements taken during the winter).
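A minimal sketch to verify that seasonal coverage, assuming the date index parsed above (the exact counts depend on the data):
# Count non-missing daily values per calendar month, summed over all sites.
# Months with few or no counts correspond to the winter gap.
monthly_coverage = (
    df.notna()                  # True wherever a site reported a daily average
      .sum(axis=1)              # number of reporting sites on each date
      .groupby(df.index.month)  # group dates by calendar month
      .sum()
)
monthly_coverage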
def plot_all_sites(df):
    image_data = df.astype('float32').T.values
    x_labels = df.index.strftime('%Y-%m-%d')  # dates → x-axis
    y_labels = list(df.columns)               # station-depths → y-axis
    x_coords = np.arange(len(x_labels))
    y_coords = np.arange(len(y_labels))
    heatmap = hv.Image((x_coords, y_coords, image_data)).opts(
        xaxis='bottom',
        xlabel='Date',
        ylabel='Station @ Depth',
        xticks=list(zip(x_coords[::30], x_labels[::30])),  # every 30th date
        yticks=list(zip(y_coords, y_labels)),
        xrotation=45,
        cmap='Viridis',
        colorbar=True,
        width=1000,
        height=800,
        tools=['hover']
    )
    return heatmap
plot_all_sites(df)
Visualize the series data¶
# Create a dropdown selector
site_selector = pn.widgets.Select(name='Site', options=list(df.columns))
def highlight_nan_regions(label):
    series = df[label]
    # Identify NaN regions
    is_nan = series.isna()
    nan_ranges = []
    current_start = None
    for date, missing in is_nan.items():
        if missing and current_start is None:
            current_start = date
        elif not missing and current_start is not None:
            nan_ranges.append((current_start, date))
            current_start = None
    if current_start is not None:
        nan_ranges.append((current_start, series.index[-1]))
    # Create shaded regions
    spans = [
        hv.VSpan(start, end).opts(color='red', alpha=0.2)
        for start, end in nan_ranges
    ]
    curve = hv.Curve(series, label=label).opts(
        width=900, height=250, tools=['hover', 'box_zoom', 'pan', 'wheel_zoom'],
        show_grid=True, title=label
    )
    return curve * hv.Overlay(spans)
interactive_plot = hv.DynamicMap(pn.bind(highlight_nan_regions, site_selector))
pn.Column(site_selector, interactive_plot, 'Highlighted regions are gaps that need to be imputed.')
Impute the gaps¶
We have determined that MissForest appears to work reasonably well when imputing artificially large gaps.
We use it here to gap-fill the missing data in this dataset.
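For reference, MissForest iteratively re-fits a random forest to each column that has gaps, using the other columns as predictors, and repeats until the change between successive imputations (the gamma statistic printed below) stops decreasing. The following is only a minimal sketch of that loop using scikit-learn's RandomForestRegressor; it is not the imputeMF implementation imported below, and the function name and arguments are illustrative. The actual imputation then uses imputeMF as before.
from sklearn.ensemble import RandomForestRegressor

def missforest_sketch(X, max_iter=10, print_stats=False):
    # X: 2-D float array with NaNs marking the gaps.
    X = np.asarray(X, dtype=float)
    mask = np.isnan(X)                                 # remember where the gaps are
    X_imp = np.where(mask, np.nanmean(X, axis=0), X)   # initial fill: column means
    for it in range(max_iter):
        X_prev = X_imp.copy()
        # Re-impute one column at a time, least-missing columns first
        for j in np.argsort(mask.sum(axis=0)):
            if not mask[:, j].any():
                continue
            obs = ~mask[:, j]
            other = np.delete(X_imp, j, axis=1)
            rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
            rf.fit(other[obs], X_imp[obs, j])
            X_imp[mask[:, j], j] = rf.predict(other[mask[:, j]])
        # gamma: relative change of the imputed entries between iterations;
        # the standard MissForest procedure stops once this first increases
        gamma = np.sum((X_imp[mask] - X_prev[mask]) ** 2) / np.sum(X_imp[mask] ** 2)
        if print_stats:
            print(f"iteration {it + 1}, gamma = {gamma}")
    return X_imp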
from imputeMF import imputeMF
df_imputed = pd.DataFrame(imputeMF(df.values, 10, print_stats=True), columns=df.columns, index=df.index)
df_imputed.to_csv('dataset_ais_imputed.csv', index=False)
Statistics:
iteration 1, gamma = 0.013953594087730862
Statistics:
iteration 2, gamma = 0.0022450032743263804
Statistics:
iteration 3, gamma = 0.0010797411383667297
Statistics:
iteration 4, gamma = 0.0007319553571579978
Statistics:
iteration 5, gamma = 0.0004763537702074109
Statistics:
iteration 6, gamma = 0.00044923842634914823
Statistics:
iteration 7, gamma = 0.0003590004468683719
Statistics:
iteration 8, gamma = 0.0002876525373357935
Statistics:
iteration 9, gamma = 0.00026769759585397235
Statistics:
iteration 10, gamma = 0.00028814694283519905
def highlight_imputed_regions(label):
    series = df[label]
    series_imputed = df_imputed[label]
    # Identify NaN regions
    is_nan = series.isna()
    nan_ranges = []
    current_start = None
    for date, missing in is_nan.items():
        if missing and current_start is None:
            current_start = date
        elif not missing and current_start is not None:
            nan_ranges.append((current_start, date))
            current_start = None
    if current_start is not None:
        nan_ranges.append((current_start, series.index[-1]))
    # Create shaded regions
    spans = [
        hv.VSpan(start, end).opts(color='red', alpha=0.2)
        for start, end in nan_ranges
    ]
    curve = hv.Curve(series_imputed, label=label).opts(
        width=900, height=250, tools=['hover', 'box_zoom', 'pan', 'wheel_zoom'],
        show_grid=True, title=label
    )
    return curve * hv.Overlay(spans)
interactive_plot = hv.DynamicMap(pn.bind(highlight_imputed_regions, site_selector))
pn.Column(site_selector, interactive_plot)
Highlighted regions show where the gaps have been imputed.
Notice that the imputation algorithm fills gaps even in time intervals where there is very limited information from any other site; care should be taken when interpreting the imputed data.
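One way to flag the least-supported imputed values is to count, for each date, how many sites actually reported an observation. A minimal sketch, assuming df still holds the gapped data; the threshold of two reporting sites is an arbitrary illustrative choice:
# Number of sites with a real (non-imputed) observation on each date
reporting_sites = df.notna().sum(axis=1)

# Imputed dates (at least one gap) where fewer than two sites reported
low_support = reporting_sites[(reporting_sites < 2) & df.isna().any(axis=1)]
print(f"{len(low_support)} imputed dates are supported by fewer than two observed sites")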
plot_all_sites(df_imputed)