Skip to article frontmatterSkip to article content

Imputation

import pandas as pd
import os
import csv
import io
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import seaborn as sns
from matplotlib.dates import DateFormatter

import sklearn.impute
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from missforest import MissForest

Imputation

Next steps include implementing goodness-of-fit (GoF) indicators ( coefficient of determination, percent bias, Kling-Gupta efficiency, MAE, RMSE, ) to evaluate the performance of the imputation algorithm. An experiment should be replicated many times to present the GoF as a distribution. Then explore how that GoF changes with the amount of missing data (gap sizes) and tuning of hyperparameters. These GoF indicators can then also be compared to other imputation algorithms to determine if MissForest (or any other more computational involved algorithm) is actually “better” than simple univariate methods.

The goal for the next two weeks is to set up the framework and run the experiments:

  • Measurement of GoF indicators
  • Multiple replicates of each experiment
  • As a function of continuous gap size
  • As a function of number of randomly removed samples
  • Compare to ‘simple’ methods such as replace with mean or median or linear interpolation

Goodness-of-fit indicators

Coefficient of determination (R2R^2)

R2=[i=1n(Oiμ0)(Siμs)i=1n(Oiμ0)2i=1n(Osμs)2]R^2 = \left [ \frac{\sum_{i=1}^{n}(O_i - \mu_0)(S_i - \mu_s)}{\sqrt{\sum_{i=1}^{n}(O_i - \mu_0)^2}\sqrt{\sum_{i=1}^{n}(O_s - \mu_s)^2}} \right ]

Percent bias (PBIAS)

PBIAS=[i=1nSiOoi=1nOi]×100PBIAS = \left [ \frac{\sum_{i=1}^{n} S_i - O_o}{\sum_{i=1}^{n}O_i} \right ] \times 100

Kling-Gupta efficiency (KGE)

KGE=1(r1)2+(β1)2+(γ1)2KGE = 1 - \sqrt{(r-1)^2 + (\beta -1)^2 + (\gamma -1)^2}
β=μsμo\beta = \frac{\mu_s}{\mu_o}
γ=σs/μsσo/μo\gamma = \frac{\sigma_s/\mu_s}{\sigma_o/\mu_o}
variable = 'temperature_degrees'
df = pd.read_csv('data.csv', parse_dates=True, index_col=0)

# Define the cutoff date
cutoff_date = "2020-09-01"

# Select rows past the cutoff date
df = df[df.index >=  pd.Timestamp(cutoff_date)]

# Define the cutoff date
cutoff_date = "2024-08-31"

# Select rows past the cutoff date
df = df[df.index <=  pd.Timestamp(cutoff_date)]

# Calculate the percentage of non-missing data for each study site
non_missing_percentage = df.notna().mean() * 100

# Filter study sites with at least 50% non-missing data
selected_sites = non_missing_percentage[non_missing_percentage >=99].index
df_selected = df[selected_sites]
def plot_data_available(df):
    # Creating a boolean DataFrame where True indicates non-missing data
    non_missing_data = ~df.isna()
    
    # Plotting
    plt.figure(figsize=(12, 8))
    sns.heatmap(non_missing_data.T, xticklabels= 180, cmap=['white', 'black'], cbar=False)
    
    plt.xlabel('Time')
    plt.ylabel('Study Site ID')
plot_data_available(df_selected)
<Figure size 1200x800 with 1 Axes>

Dataset consists of 31 “stations” with 1461 observations each.

Remove contiguous segments

At each station remove contiguous segments (having lengths 7, 14, 21, 30 days)

df = df_selected.copy()

# Function to randomly set a n-day contiguous segment as missing for each column
n = 180
for col in df.columns:
    # Randomly select the start of the n-day segment
    start_idx = np.random.randint(0, len(df) - n)
    end_idx = start_idx + n

    # Set the values in this range to NaN
    df.iloc[start_idx:end_idx, df.columns.get_loc(col)] = np.nan
plot_data_available(df)
<Figure size 1200x800 with 1 Axes>
def run_experiment(method = 'MF', gap_type = 'continuous', n = 1):
    
    variable = 'temperature_degrees'
    df = pd.read_csv('data.csv', parse_dates=True, index_col=0)
    
    # Define the cutoff date
    cutoff_date = "2020-06-01"
    
    # Select rows past the cutoff date
    df = df[df.index >=  pd.Timestamp(cutoff_date)]
    
    # Define the cutoff date
    cutoff_date = "2024-06-01"
    
    df = df[df.index <  pd.Timestamp(cutoff_date)]
    
    # Calculate the percentage of non-missing data for each study site
    non_missing_percentage = df.notna().mean() * 100
    
    # Filter study sites with at least 50% non-missing data
    selected_sites = non_missing_percentage[non_missing_percentage >= 99].index
    df_selected = df[selected_sites]

    # artifical gaps 
    df = df_selected.copy()


    if gap_type == 'continuous':
        # randomly set a n-day contiguous segment as missing for each column
        for col in df.columns:
            # Randomly select the start of the n-day segment
            start_idx = np.random.randint(0, len(df) - n)
            end_idx = start_idx + n
        
            # Set the values in this range to NaN
            df.iloc[start_idx:end_idx, df.columns.get_loc(col)] = np.nan
    elif gap_type == 'random':

        # remove randomly selected n days as missing for each column
        for col in df.columns:
            # Randomly sample unique dates
            random_dates = np.random.choice(df.index, size=n, replace=False)
            
            # Set the values to NaN for those dates
            df.loc[random_dates, col] = np.nan
    else:
        raise Exception(f'Unknown gap_type {gap_type}')

    if method == 'MF':
        # Imputer
        clf = RandomForestClassifier(n_jobs=-1)
        rgr = RandomForestRegressor(n_jobs=-1, n_estimators=5)
    
        imputer = MissForest(clf, rgr, verbose=1,max_iter=5)
        df_sim = imputer.fit_transform(df)
    
    elif method == 'linear':
        df_sim = df.interpolate(method='linear')
    else:
        raise Exception(f'Unknown method {method}')
          
    
    suffix = f'_{gap_type}_{n}_{method}'
    # save output
    df_selected.to_csv(f'obs{suffix}.csv')
    df_imputed.to_csv(f'sim{suffix}.csv')
    
method = 'linear'
gap_type = 'continuous'
for n in [7, 14, 21, 30, 60, 180, 365]:
    run_experiment(gap_type=gap_type, n=n, method=method)

gap_type = 'random'
for n in [30, 60, 90, 120, 180, 365]:
    run_experiment(gap_type=gap_type, n=n, method=method)
station_id = '28'
plt.figure(figsize=(16,4))
plt.plot(df_imputed[station_id], label='Simulated')
plt.plot(df_selected[station_id], label='Observed')

plt.plot(abs(df_selected[station_id] - df_imputed[station_id])+20, label='Error')

plt.legend()
<Figure size 1600x400 with 1 Axes>

Calculate GoFs

S = df_imputed
O = df_selected

μo = O.mean()
σo = O.std()

μs = S.mean()
σs = S.std()

# bias ratio
β = μs / μo

# variability ratio
γ = (σs/μs) / (σo/μo)

## correlation coefficient
r = df_selected.corrwith(df_imputed)
R2 = ((O - μo) * (S - μs)).sum() / (np.sqrt(((O-μo)**2).sum()) * np.sqrt(((S-μs)**2).sum()))

PBIAS = (S - O).sum() / O.sum() * 100

KGE = 1 - np.sqrt((r - 1)**2 + (β - 1)**2 + (γ - 1)**2)
def KGE(S, O):
    μo = O.mean()
    σo = O.std()

    μs = S.mean()
    σs = S.std()

    # bias ratio
    β = μs / μo

    # variability ratio
    γ = (σs/μs) / (σo/μo)

    ## correlation coefficient
    r = O.corrwith(S)

    return  1 - np.sqrt((r - 1)**2 + (β - 1)**2 + (γ - 1)**2)
fig, ax = plt.subplots(1, 3, figsize=(12, 4))

ax[0].boxplot(R2)
ax[0].set_title('$R^2$')

ax[1].boxplot(PBIAS)
ax[1].set_title('PBIAS (%)')

ax[2].boxplot(KGE)
ax[2].set_title('KGE')
<Figure size 1200x400 with 3 Axes>
def gof(O, S):
        
    μo = O.mean()
    σo = O.std()
    
    μs = S.mean()
    σs = S.std()
    
    # bias ratio
    β = μs / μo
    
    # variability ratio
    γ = (σs/μs) / (σo/μo)
    
    ## correlation coefficient
    r = O.corrwith(S)
    
    R2 = ((O - μo) * (S - μs)).sum() / (np.sqrt(((O-μo)**2).sum()) * np.sqrt(((S-μs)**2).sum()))
    
    PBIAS = (S - O).sum() / O.sum() * 100
    
    KGE = 1 - np.sqrt((r - 1)**2 + (β - 1)**2 + (γ - 1)**2)

    return pd.DataFrame({'R2': R2, 'PBIAS': PBIAS, 'KGE': KGE})
performance = {}

gap_type = 'continuous'
performance[gap_type] = { 'R2': {}, 'PBIAS': {}, 'KGE': {} }
for n in [7, 14, 21, 30, 60, 180, 365]:
    for method in ['', '_linear']:
        suffix = f'_{gap_type}_{n}{method}'
    
        S = pd.read_csv(f'sim{suffix}.csv', index_col=0)
        O = pd.read_csv(f'obs{suffix}.csv', index_col=0)

        results = gof(S, O)

        if method == '':
            method = 'MF'
        elif method == '_linear':
            method = 'LR'
            
        position = f'{n} {method}'
        performance[gap_type]['R2'][position] = results['R2']
        performance[gap_type]['PBIAS'][position] = results['PBIAS']
        performance[gap_type]['KGE'][position] = results['KGE']

        
gap_type = 'random'
performance[gap_type] = { 'R2': {}, 'PBIAS': {}, 'KGE': {} }
for n in [30, 60, 90, 120, 180, 365]:
    for method in ['', '_linear']:
        suffix = f'_{gap_type}_{n}{method}'
    
        S = pd.read_csv(f'sim{suffix}.csv', index_col=0)
        O = pd.read_csv(f'obs{suffix}.csv', index_col=0)

        results = gof(S, O)

        if method == '':
            method = 'MF'
        elif method == '_linear':
            method = 'LR'
            
        position = f'{n} {method}'

        performance[gap_type]['R2'][position] = results['R2']
        performance[gap_type]['PBIAS'][position] = results['PBIAS']
        performance[gap_type]['KGE'][position] = results['KGE']

for gap_type in ['continuous', 'random']:
    for metric in ['R2', 'PBIAS', 'KGE']:
        performance[gap_type][metric] = pd.DataFrame(performance[gap_type][metric])
performance['continuous']['R2']
Loading...
fig, ax = plt.subplots(3, 2, figsize=(12, 16))

for i, gap_type in enumerate(['continuous', 'random']):
    labels = list(performance[gap_type]['R2'].columns)
    
    ax[0,i].boxplot(performance[gap_type]['R2'])
    ax[0,i].set_xticklabels(labels, rotation=90)
    ax[0,i].set_title('$R^2$')

    ax[1,i].boxplot(performance[gap_type]['PBIAS'])
    ax[1,i].set_xticklabels(labels, rotation=90)
    ax[1,i].set_title('PBIAS (%)')

    ax[2,i].boxplot(performance[gap_type]['KGE'])
    ax[2,i].set_xticklabels(labels, rotation=90)
    ax[2,i].set_title('KGE')
    
plt.show()
<Figure size 1200x1600 with 6 Axes>
results = gof(S, O)
results.boxplot()
<Axes: >
<Figure size 640x480 with 1 Axes>

plt.boxplot(gof(S,O), positions)
<Figure size 640x480 with 1 Axes>
fig, ax = plt.subplots(3, 2, figsize=(12, 16))

ax[0].boxplot(R2)
ax[0].set_title('$R^2$')

ax[1].boxplot(PBIAS)
ax[1].set_title('PBIAS (%)')

ax[2].boxplot(KGE)
ax[2].set_title('KGE')
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[270], line 3
      1 fig, ax = plt.subplots(3, 2, figsize=(12, 16))
----> 3 ax[0].boxplot(R2)
      4 ax[0].set_title('$R^2$')
      6 ax[1].boxplot(PBIAS)

AttributeError: 'numpy.ndarray' object has no attribute 'boxplot'
<Figure size 1200x1600 with 6 Axes>