Download from https:// (link truncated — CMAR water quality datasets; TODO: restore the full URL)
Counties:
- Annapolis County
- Halifax County
- Lunenburg County
- Guysborough County
- Colchester County
- Pictou County
- Shelburne County
- Inverness County
- Antigonish County
- Yarmouth County
- Digby County
- Queens County
- Richmond County
- Victoria County
- Cape Breton County
import pandas as pd
import os
import numpy as np
from tqdm.notebook import tqdm
Pull out the metadata for these sensor strings.
# All 15 counties with CMAR water quality files. NOTE: the original list
# omitted 'Cape_Breton', yet the printed output and the data_ns/ listing
# below show Cape Breton was processed — it is restored here.
counties = ['Annapolis', 'Halifax', 'Lunenburg', 'Guysborough', 'Colchester',
            'Pictou', 'Shelburne', 'Inverness', 'Antigonish', 'Yarmouth',
            'Digby', 'Queens', 'Richmond', 'Victoria', 'Cape_Breton']

# Read the descriptive columns of each county file as strings, de-duplicate
# the rows, and tag each with its county; collect one frame per county.
metadata = []
for county in counties:
    print(county)
    filename = os.path.join('../CMAR_WaterQualityDatasets',
                            f'{county}_County_Water_Quality_Data_20250130.csv')
    # dtype='str' keeps ids/flags verbatim; column 9 is deliberately skipped
    # (presumably the measurement value itself — confirm against the file header).
    df = pd.read_csv(filename, sep=',', dtype='str',
                     usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11])
    df = df.drop_duplicates()
    # NOTE(review): reset_index() keeps the old row index as an 'index' column,
    # which later ends up in cmar_datasets.csv alongside the written index.
    df = df.reset_index()
    df.insert(0, 'county', county)
    metadata.append(df)
Annapolis
Halifax
Lunenburg
Guysborough
Colchester
Pictou
Shelburne
Inverness
Antigonish
Yarmouth
Digby
Queens
Richmond
Victoria
Cape_Breton
# Stack the per-county metadata frames into a single table.
df = pd.concat(metadata)
df
# Persist; index=True keeps the (non-unique, per-county) row index.
df.to_csv('cmar_datasets.csv', index=True)
# Extract the analysis columns from each county file, filter to the
# Sep 2020 – Aug 2024 study window, and cache the result under data_ns/.
os.makedirs('data_ns', exist_ok=True)
for county in tqdm(counties):
    csvfile = f"data_ns/{county}.csv"
    if os.path.exists(csvfile):
        continue  # already cached; skip the expensive re-read
    filename = os.path.join('../CMAR_WaterQualityDatasets',
                            f'{county}_County_Water_Quality_Data_20250130.csv')
    df = pd.read_csv(filename, usecols=[0, 1, 5, 9, 10, 16, 21],
                     parse_dates=['timestamp_utc'],
                     date_format="%m/%d/%Y %I:%M:%S %p")
    # Filter by date range. Use an exclusive upper bound of Sep 1 so all of
    # 2024-08-31 is kept: the original `<= '2024-08-31'` compares against
    # midnight and silently dropped every reading later that day.
    df = df[(df.timestamp_utc >= '2020-09-01') & (df.timestamp_utc < '2024-09-01')]
    df.to_csv(csvfile, index=False)
Loading...
!ls -lh data_ns/
total 2.1G
-rw-r--r-- 1 jmunroe jmunroe 28M Jun 11 22:51 Annapolis.csv
-rw-r--r-- 1 jmunroe jmunroe 90M Jun 11 22:52 Antigonish.csv
-rw-r--r-- 1 jmunroe jmunroe 126 Jun 11 22:52 Cape_Breton.csv
-rw-r--r-- 1 jmunroe jmunroe 27M Jun 11 22:52 Colchester.csv
-rw-r--r-- 1 jmunroe jmunroe 167M Jun 11 22:52 Digby.csv
-rw-r--r-- 1 jmunroe jmunroe 783M Jun 11 22:52 Guysborough.csv
-rw-r--r-- 1 jmunroe jmunroe 150M Jun 11 22:51 Halifax.csv
-rw-r--r-- 1 jmunroe jmunroe 34M Jun 11 22:52 Inverness.csv
-rw-r--r-- 1 jmunroe jmunroe 329M Jun 11 22:51 Lunenburg.csv
-rw-r--r-- 1 jmunroe jmunroe 40M Jun 11 22:52 Pictou.csv
-rw-r--r-- 1 jmunroe jmunroe 60M Jun 11 22:52 Queens.csv
-rw-r--r-- 1 jmunroe jmunroe 75M Jun 11 22:52 Richmond.csv
-rw-r--r-- 1 jmunroe jmunroe 119M Jun 11 22:52 Shelburne.csv
-rw-r--r-- 1 jmunroe jmunroe 150K Jun 11 22:52 Victoria.csv
-rw-r--r-- 1 jmunroe jmunroe 155M Jun 11 22:52 Yarmouth.csv
# Split each county's cached data into one CSV per sensor "segment" — a unique
# county / waterbody / station / depth / deployment-range combination — and
# build a single metadata table mapping each segment id to those attributes.
os.makedirs('segments_ns', exist_ok=True)
all_segment_metadata = []
for county in tqdm(counties):
    csvfile = f"data_ns/{county}.csv"
    df = pd.read_csv(csvfile)
    # Rename the CMAR export columns to the display names used downstream.
    df.rename(columns={'timestamp_utc': 'time (UTC)',
                       'sensor_depth_at_low_tide_m': 'depth (m)',
                       'temperature_degree_c': 'temperature (degrees_Celsius)',
                       'qc_flag_temperature_degree_c': 'qc_flag_temperature'
                       },
              inplace=True)
    # Segment id: county plus the '_'-joined identifying attributes.
    # NOTE(review): assumes these values never contain characters that are
    # illegal in filenames (e.g. '/'), since the id becomes a file name below.
    df['segment'] = df[['waterbody', 'station', 'depth (m)',
                        'deployment_range',
                        ]].agg(lambda x: county + '_' + '_'.join([str(y) for y in x]), axis=1)
    # One metadata row per unique segment for this county.
    df_metadata = df[['segment', 'waterbody', 'station', 'depth (m)',
                      'deployment_range',
                      ]]
    df_metadata = df_metadata.drop_duplicates()
    all_segment_metadata.append(df_metadata)
    # The time series itself: drop the attributes now encoded in the segment id.
    df_data = df.drop(columns=['waterbody', 'station', 'depth (m)',
                               'deployment_range',
                               ])
    df_data = df_data.sort_values(by=['segment', 'time (UTC)'])
    df_data.set_index(['segment', 'time (UTC)'], inplace=True)
    # Write each segment's readings to its own file, indexed by time only.
    for key, segment_df in df_data.groupby(level=0):
        csvfile = f'segments_ns/{key}.csv'
        segment_df = segment_df.droplevel(0)
        segment_df.to_csv(csvfile)
# Combine and persist the segment metadata, keyed by segment id.
df_metadata = pd.concat(all_segment_metadata)
df_metadata.set_index('segment', inplace=True)
df_metadata.to_csv('metadata_ns.csv')
Loading...
!ls -lh segments_ns/ | wc
1126 15215 135587
df_metadata
Loading...