Skip to article frontmatterSkip to article content

CMAR Water Quality Datasets

Counties:

  • Annapolis County
  • Halifax County
  • Lunenburg County
  • Guysborough County
  • Colchester County
  • Pictou County
  • Shelburne County
  • Inverness County
  • Antigonish County
  • Yarmouth County
  • Digby County
  • Queens County
  • Richmond County
  • Victoria County
  • Cape Breton County
import pandas as pd
import os
import numpy as np
from tqdm.notebook import tqdm

Extract the deployment metadata for the sensor strings in each county dataset.

# One published data file per county. 'Cape_Breton' completes the list of
# county datasets enumerated above — the recorded run output (and the
# resulting Cape_Breton.csv) shows it was processed, so its omission here
# was a bug.
counties = ['Annapolis', 'Halifax', 'Lunenburg', 'Guysborough', 'Colchester',
            'Pictou', 'Shelburne', 'Inverness', 'Antigonish', 'Yarmouth',
            'Digby', 'Queens', 'Richmond', 'Victoria', 'Cape_Breton']

# Collect the distinct metadata rows from every county file.
metadata = []
for county in counties:
    print(county)
    filename = os.path.join('../CMAR_WaterQualityDatasets', f'{county}_County_Water_Quality_Data_20250130.csv')
    # Read everything as strings: only the distinct metadata values matter
    # here, not typed measurements. Column 9 is skipped (presumably the
    # measurement value itself) so drop_duplicates() collapses the frame to
    # one row per deployment/variable combination.
    df = pd.read_csv(filename, sep=',', dtype='str', usecols=[0,1,2,3,4,5,6,7,8,10,11])
    df = df.drop_duplicates()
    # NOTE(review): reset_index() (without drop=True) keeps the original row
    # index as an 'index' column in the saved catalogue — confirm intended.
    df = df.reset_index()
    df.insert(0, 'county', county)
    metadata.append(df)
Annapolis
Halifax
Lunenburg
Guysborough
Colchester
Pictou
Shelburne
Inverness
Antigonish
Yarmouth
Digby
Queens
Richmond
Victoria
Cape_Breton
# Stack the per-county metadata frames into a single catalogue table.
df = pd.concat(metadata)
df
Loading...
# Persist the combined metadata catalogue (index=True keeps the row index,
# which here is the per-county index kept by the earlier reset_index()).
df.to_csv('cmar_datasets.csv', index=True)
# Directory for the filtered per-county data extracts written below.
os.makedirs('data_ns', exist_ok=True)

# Extract a four-year window (2020-09-01 through 2024-08-31, inclusive) of
# the analysis columns from each county file, caching one CSV per county.
for county in tqdm(counties):
    csvfile = f"data_ns/{county}.csv"

    # Skip counties already extracted (delete the file to force a refresh).
    if os.path.exists(csvfile):
        continue

    filename = os.path.join('../CMAR_WaterQualityDatasets', f'{county}_County_Water_Quality_Data_20250130.csv')
    df = pd.read_csv(filename, usecols=[0,1,5,9,10,16,21], parse_dates=['timestamp_utc'], date_format="%m/%d/%Y %I:%M:%S %p")

    # Filter by date range. The upper bound is an exclusive midnight
    # 2024-09-01: the previous `<= '2024-08-31'` compared timestamps
    # against 2024-08-31 00:00:00 and silently dropped all observations
    # taken during the final day of the window.
    df = df[(df.timestamp_utc >= '2020-09-01') & (df.timestamp_utc < '2024-09-01')]

    df.to_csv(csvfile, index=False)
Loading...
!ls -lh data_ns/
total 2.1G
-rw-r--r-- 1 jmunroe jmunroe  28M Jun 11 22:51 Annapolis.csv
-rw-r--r-- 1 jmunroe jmunroe  90M Jun 11 22:52 Antigonish.csv
-rw-r--r-- 1 jmunroe jmunroe  126 Jun 11 22:52 Cape_Breton.csv
-rw-r--r-- 1 jmunroe jmunroe  27M Jun 11 22:52 Colchester.csv
-rw-r--r-- 1 jmunroe jmunroe 167M Jun 11 22:52 Digby.csv
-rw-r--r-- 1 jmunroe jmunroe 783M Jun 11 22:52 Guysborough.csv
-rw-r--r-- 1 jmunroe jmunroe 150M Jun 11 22:51 Halifax.csv
-rw-r--r-- 1 jmunroe jmunroe  34M Jun 11 22:52 Inverness.csv
-rw-r--r-- 1 jmunroe jmunroe 329M Jun 11 22:51 Lunenburg.csv
-rw-r--r-- 1 jmunroe jmunroe  40M Jun 11 22:52 Pictou.csv
-rw-r--r-- 1 jmunroe jmunroe  60M Jun 11 22:52 Queens.csv
-rw-r--r-- 1 jmunroe jmunroe  75M Jun 11 22:52 Richmond.csv
-rw-r--r-- 1 jmunroe jmunroe 119M Jun 11 22:52 Shelburne.csv
-rw-r--r-- 1 jmunroe jmunroe 150K Jun 11 22:52 Victoria.csv
-rw-r--r-- 1 jmunroe jmunroe 155M Jun 11 22:52 Yarmouth.csv
# Split each county extract into one CSV per measurement "segment"
# (county + waterbody + station + depth + deployment range) and build a
# segment-level metadata catalogue alongside the data files.
os.makedirs('segments_ns', exist_ok=True)

# Columns that together identify a unique deployment segment within a county.
segment_key_cols = ['waterbody', 'station', 'depth (m)', 'deployment_range']

all_segment_metadata = []
for county in tqdm(counties):
    county_df = pd.read_csv(f"data_ns/{county}.csv")

    # Rename columns to the display names used in the segment files.
    county_df = county_df.rename(columns={
        'timestamp_utc': 'time (UTC)',
        'sensor_depth_at_low_tide_m': 'depth (m)',
        'temperature_degree_c': 'temperature (degrees_Celsius)',
        'qc_flag_temperature_degree_c': 'qc_flag_temperature',
    })

    # Segment label: the county name plus the identifying column values,
    # joined with underscores.
    county_df['segment'] = county_df[segment_key_cols].agg(
        lambda row: '_'.join([county] + [str(v) for v in row]), axis=1)

    # One metadata row per distinct segment.
    all_segment_metadata.append(
        county_df[['segment'] + segment_key_cols].drop_duplicates())

    # Time-ordered observations, indexed by segment for the split below.
    observations = (county_df.drop(columns=segment_key_cols)
                             .sort_values(by=['segment', 'time (UTC)'])
                             .set_index(['segment', 'time (UTC)']))

    # Write one CSV per segment, dropping the now-constant segment level.
    # NOTE(review): the segment label is interpolated into a path, so a
    # waterbody/station name containing '/' would break the write — confirm
    # the source names are path-safe.
    for segment_name, segment_rows in observations.groupby(level='segment'):
        segment_rows.droplevel('segment').to_csv(f'segments_ns/{segment_name}.csv')

df_metadata = pd.concat(all_segment_metadata)
df_metadata.set_index('segment', inplace=True)
df_metadata.to_csv('metadata_ns.csv')
Loading...
!ls -lh segments_ns/ | wc
   1126   15215  135587
df_metadata
Loading...