import pandas as pd
import os
import csv
import io
import numpy as np
# Load the temperature readings of one segment, indexed by their UTC timestamp.
segment_id = 0
filename = f'temperature_degree_c/{segment_id}_temperature_degree_c.csv'
# TODO (carried over from the original note): work out whether the source
# files are reliably ordered; until then the index is sorted explicitly.
df = pd.read_csv(
    filename,
    index_col='timestamp_utc',
    parse_dates=['timestamp_utc'],
    date_format="%m/%d/%Y %I:%M:%S %p",
).sort_index()
def _error_metrics(truth, filled):
    """Return the MAE and RMSE of ``filled`` measured against ``truth``."""
    error = truth - filled
    return {'MAE': error.abs().mean(), 'RMSE': np.sqrt((error ** 2).mean())}


# Display name -> callable that fills the NaN gap of a Series.
# Insertion order is the order of the keys in the returned results dict.
_FILL_METHODS = {
    'Last Observation Carried Forward': lambda s: s.ffill(),
    'Mean Value': lambda s: s.fillna(s.mean()),
    'Linear Interpolation': lambda s: s.interpolate(method='linear'),
    'Nearest Neighbour': lambda s: s.interpolate(method='nearest'),
    'Polynomial Interpolation (K=2)':
        lambda s: s.interpolate(method='polynomial', order=2),
    'Polynomial Interpolation (K=3)':
        lambda s: s.interpolate(method='polynomial', order=3),
    'Spline Interpolation (K=2)':
        lambda s: s.interpolate(method='spline', order=2),
    'Spline Interpolation (K=3)':
        lambda s: s.interpolate(method='spline', order=3),
}


def test_methods(segment_id, percentage=10):
    """Score several gap-filling methods on one temperature segment.

    Reads ``temperature_degree_c/{segment_id}_temperature_degree_c.csv``,
    blanks out one randomly placed contiguous run of rows, fills the gap with
    each method in turn and compares the result with the original readings.

    Args:
        segment_id: Identifier used to build the CSV file name.
        percentage: Share of rows (in percent) to blank out; at least one row
            is always removed.

    Returns:
        A dict mapping each method's display name to
        ``{'MAE': ..., 'RMSE': ...}``.

    Raises:
        ValueError: If the gap would be longer than the data (for example an
            empty file, or ``percentage`` above 100).

    Notes:
        * Errors are averaged over every row of the segment, not only the
          blanked ones, so the untouched rows contribute zero error.
        * Rows a method leaves unfilled stay NaN and are skipped by the means
          (pandas' default ``skipna`` behaviour).
        * The gap position comes from ``np.random``; seed it for repeatable
          results.
    """
    # The segment passed by the caller is used as-is (a leftover hard-coded
    # ``segment_id = 0`` previously made every call read segment 0).
    filename = f'temperature_degree_c/{segment_id}_temperature_degree_c.csv'
    df = pd.read_csv(
        filename,
        parse_dates=['timestamp_utc'],
        index_col='timestamp_utc',
        date_format="%m/%d/%Y %I:%M:%S %p",
    )
    # TODO (carried over): work out whether the files are reliably ordered;
    # until then sort so the blanked rows form a contiguous run in time.
    df = df.sort_index()

    num_rows = len(df)
    # Ensure at least one row is set as missing.
    num_missing = max(1, int(num_rows * percentage / 100))
    if num_missing > num_rows:
        raise ValueError(
            f'cannot blank {num_missing} row(s) out of {num_rows} '
            f'in {filename}'
        )

    # Randomly select a starting index for the contiguous block.
    start_index = np.random.randint(0, num_rows - num_missing + 1)

    truth = df['temperature_degree_c']
    # First data column, as in the original code; presumably the same
    # temperature column the errors are measured against -- TODO confirm.
    # Cast to float so NaN can be written even if the readings are integers.
    masked = df.iloc[:, 0].astype(float)
    masked.iloc[start_index:start_index + num_missing] = np.nan

    return {
        name: _error_metrics(truth, fill(masked))
        for name, fill in _FILL_METHODS.items()
    }


# The name starts with ``test_`` but this is not a pytest test; opt out of
# collection so pytest does not try to resolve ``segment_id`` as a fixture.
test_methods.__test__ = False
# Results
# Experiment: blank out 5% of segment 5, repeated 20 times with a fresh
# random gap each run, then average the scores per method.
segment_id = 5
rows = []
for _ in range(20):
    scores = test_methods(segment_id, 5)
    rows.extend(
        (method, metrics['MAE'], metrics['RMSE'])
        for method, metrics in scores.items()
    )
results = pd.DataFrame(rows, columns=['method', 'MAE', 'RMSE'])
# Mean score per method, best (lowest MAE) first.
(
    results.groupby('method')
    .mean()
    .sort_values(by='MAE')
)
Loading...
# Experiment: blank out 10% of the segment, repeated 20 times with a fresh
# random gap each run, then average the scores per method.
rows = []
for _ in range(20):
    scores = test_methods(segment_id, 10)
    rows.extend(
        (method, metrics['MAE'], metrics['RMSE'])
        for method, metrics in scores.items()
    )
results = pd.DataFrame(rows, columns=['method', 'MAE', 'RMSE'])
# Mean score per method, best (lowest MAE) first.
(
    results.groupby('method')
    .mean()
    .sort_values(by='MAE')
)
Loading...
# Experiment: blank out 20% of the segment, repeated 20 times with a fresh
# random gap each run, then average the scores per method.
rows = []
for _ in range(20):
    scores = test_methods(segment_id, 20)
    rows.extend(
        (method, metrics['MAE'], metrics['RMSE'])
        for method, metrics in scores.items()
    )
results = pd.DataFrame(rows, columns=['method', 'MAE', 'RMSE'])
# Mean score per method, best (lowest MAE) first.
(
    results.groupby('method')
    .mean()
    .sort_values(by='MAE')
)
Loading...