Source code for ctdcal.oxy_fitting

"""
Module for processing oxygen from CTD and bottle samples.
"""

import csv
import logging
import xml.etree.cElementTree as ET
from collections import OrderedDict
from pathlib import Path

import gsw
import numpy as np
import pandas as pd
import scipy

from . import ctd_plots as ctd_plots
from . import flagging as flagging
from . import get_ctdcal_config
from . import process_ctd as process_ctd

cfg = get_ctdcal_config()
log = logging.getLogger(__name__)


[docs]def load_winkler_oxy(oxy_file): """ Load Winkler oxygen titration data file. Parameters ---------- oxy_file : str or Path Path to oxygen file Returns ------- df : DataFrame Oxygen data params : list of str List of oxygen parameters used in titration """ with open(oxy_file, newline="") as f: oxyF = csv.reader( f, delimiter=" ", quoting=csv.QUOTE_NONE, skipinitialspace="True" ) oxy_array = [] for row in oxyF: if len(row) > 9: row = row[:9] oxy_array.append(row) # TODO turn params into a dict with useful labels params = oxy_array.pop(0) # save file header info for later (Winkler values) cols = OrderedDict( [ ("STNNO_OXY", int), ("CASTNO_OXY", int), ("BOTTLENO_OXY", int), ("FLASKNO", int), ("TITR_VOL", float), ("TITR_TEMP", float), ("DRAW_TEMP", float), ("TITR_TIME", int), ("END_VOLTS", float), ] ) df = pd.DataFrame(oxy_array, columns=cols.keys()).astype(cols) df = df[df["BOTTLENO_OXY"] != 99] # remove "Dummy Data" df = df[df["TITR_VOL"] > 0] # remove "ABORTED DATA" df = df.sort_values("BOTTLENO_OXY").reset_index(drop=True) df["FLASKNO"] = df["FLASKNO"].astype(str) return df, params
[docs]def load_flasks(flask_file=cfg.dirs["oxygen"] + "o2flasks.vol", comment="#"): """ Load oxygen flask information from .vol file. Parameters ---------- flask_file : str or Path, optional Path to flask file comment : str, optional Identifier signifying line is a comment and should be skipped Returns ------- flasks : DataFrame Flask numbers and volumes """ with open(flask_file, "r") as f: flasks = [] for line in f: is_comment = line.strip().startswith(comment) if ("Volume" in line) or is_comment: continue num, vol = line.strip().split()[:2] # only need first two cols (#, volume) flasks.append([str(num), float(vol)]) flasks = pd.DataFrame(flasks, columns=["FLASKNO", "FLASK_VOL"]) return flasks
[docs]def correct_flask_vol(flask_vol, t=20.0, glass="borosilicate"): """ Correct flask volume for changes from thermal expansion of glass. Parameters ---------- flask_vol : array-like Flask volumes at standard temperature (20C) t : float, optional New temperature to calculate volume glass : str, optional Type of glass ("borosilicate" or "soft) Returns ------- corrected_vol : array-like Flask volumes are new temperature Notes ----- Flask volume equation from 2007 Best Practices for Ocean CO2 Measurements, SOP 13 - Gravimetric calibration of volume contained using water """ alpha = { # thermal expansion coefficient "borosilicate": 1.0e-5, "soft": 2.5e-3, } if glass not in alpha.keys(): raise KeyError(f"Glass type not found, must be one of {list(alpha.keys())}") standard_t = 20.0 corrected_vol = flask_vol * (1.0 + alpha[glass] * (t - standard_t)) return corrected_vol
[docs]def gather_oxy_params(oxy_file): """ Collect Winkler oxygen measurement parameters from LabVIEW data file headers. Parameters ---------- oxy_file : str or Path Path to oxygen file Returns ------- df : DataFrame Oxygen measurement parameters """ titr_columns = ["V_std", "V_blank", "N_KIO3", "V_KIO3", "T_KIO3", "T_thio"] try: with open(oxy_file, newline="") as f: header = f.readline() param_list = header.split()[:6] params = pd.DataFrame(param_list, dtype=float).transpose() params.columns = titr_columns except FileNotFoundError: # fitting data before titration file has been received from oxygen analyst log.info(f"Failed to load {oxy_file} titration file, filling with NaNs") params = pd.DataFrame(np.nan, index=[0], columns=titr_columns) return params
[docs]def calculate_bottle_oxygen(ssscc_list, ssscc_col, titr_vol, titr_temp, flask_nums): """ Wrapper function for collecting parameters and calculating oxygen values from Winkler titrations. Parameters ---------- ssscc_list : list of str List of stations to process ssscc_col : array-like Station/cast for each sample taken titr_vol : array-like Titration volume [mL] titr_temp : array-like Temperature of titration [degC] flask_nums : array-like Oxygen flask used for each sample Returns ------- oxy_mL_L : array-like Oxygen concentration [mL/L] Notes ----- Titration equation comes from WHP Operations and Methods, Culberson (1991): https://cchdo.github.io/hdo-assets/documentation/manuals/pdf/91_1/culber2.pdf """ params = pd.DataFrame() for ssscc in ssscc_list: df = gather_oxy_params(cfg.dirs["oxygen"] + ssscc) df["SSSCC"] = ssscc params = pd.concat([params, df]) # get flask volumes and merge with titration parameters flask_df = load_flasks() # TODO: volume correction from thermal expansion? volumes = pd.merge(flask_nums, flask_df, how="left")["FLASK_VOL"].values params = pd.merge(ssscc_col, params, how="left") # find 20degC equivalents rho_20C = gsw.rho_t_exact(0, 20, 0) rho_T_KIO3 = gsw.rho_t_exact(0, params["T_KIO3"], 0) N_KIO3_20C = params["N_KIO3"] * (rho_T_KIO3 / rho_20C) # TODO: does KIO3 volume get corrected? what is the recorded value? # V_KIO3_20C = correct_flask_vol(params["V_KIO3"], t=params["T_KIO3"]) # calculate O2 concentration (in mL/L) E = 5598 # stoichiometric relationship between thio_n and DO DO_reg = 0.0017 # correction for oxygen added by reagents V_reg = 2.0 # volume of reagents (mL) oxy_mL_L = ( ( ((titr_vol.values - params["V_blank"]) * params["V_KIO3"] * N_KIO3_20C * E) / (params["V_std"] - params["V_blank"]) - 1000 * DO_reg ) ) / (volumes - V_reg) return oxy_mL_L.values
[docs]def hysteresis_correction(oxygen, pressure, H1=-0.033, H2=5000, H3=1450, freq=24): """ Remove hysteresis effects from oxygen concentration values. Oxygen hysteresis can be corrected before conversion from volts to oxygen concentration, see equations_sbe.sbe43_hysteresis_voltage() # TODO: should this just be a wrapper that calls sbe43_hysteresis_voltage()? Parameters ---------- oxygen : array-like Oxygen concentration values pressure : array-like CTD pressure values (dbar) H1 : scalar, optional Amplitude of hysteresis correction function (range: -0.02 to -0.05) H2 : scalar, optional Function constant or curvature function for hysteresis H3 : scalar, optional Time constant for hysteresis (seconds) (range: 1200 to 2000) freq : scalar, optional CTD sampling frequency (Hz) Returns ------- oxy_corrected : array-like Hysteresis-corrected oxygen concentration values (with same units as input) Notes ----- See Application Note 64-3 for more information. """ # TODO: vectorize (if possible), will probably require matrix inversion dt = 1 / freq D = 1 + H1 * (np.exp(pressure / H2) - 1) C = np.exp(-1 * dt / H3) oxy_corrected = np.zeros(oxygen.shape) oxy_corrected[0] = oxygen[0] for i in np.arange(1, len(oxygen)): oxy_corrected[i] = ( oxygen[i] + (oxy_corrected[i - 1] * C * D[i]) - (oxygen[i - 1] * C) ) / D[i] return oxy_corrected
[docs]def oxy_ml_to_umolkg(oxy_mL_L, sigma0): """Convert dissolved oxygen from units of mL/L to micromol/kg. Parameters ---------- oxy_mL_L : array-like Dissolved oxygen in units of [mL/L] sigma0 : array-like Potential density anomaly (i.e. sigma - 1000) referenced to 0 dbar [kg/m^3] Returns ------- oxy_umol_kg : array-like Dissolved oxygen in units of [umol/kg] Notes ----- Conversion value 44660 is exact for oxygen gas and derived from the ideal gas law. (c.f. Sea-Bird Application Note 64, pg. 6) """ oxy_umol_kg = oxy_mL_L * 44660 / (sigma0 + 1000) return oxy_umol_kg
[docs]def oxy_umolkg_to_ml(oxy_umol_kg, sigma0): """Convert dissolved oxygen from units of micromol/kg to mL/L. Parameters ---------- oxy_umol_kg : array-like Dissolved oxygen in units of [umol/kg] sigma0 : array-like Potential density anomaly (i.e. sigma - 1000) referenced to 0 dbar [kg/m^3] Returns ------- oxy_mL_L : array-like Dissolved oxygen in units of [mL/L] Notes ----- Conversion value 44660 is exact for oxygen gas and derived from the ideal gas law. (c.f. Sea-Bird Application Note 64, pg. 6) """ oxy_mL_L = oxy_umol_kg * (sigma0 + 1000) / 44660 return oxy_mL_L
[docs]def calculate_dV_dt(oxy_volts, time, nan_replace=True): """ Calculate the time derivative of oxygen voltage. Parameters ---------- oxy_volts : array-like Oxygen sensor voltage output time : array-like Time from oxygen sensor (must be same length as oxy_volts) nan_replace : bool, optional Replace nans in time derivative with the mean value Returns ------- dV_dt : array-like Time derivative of oxygen voltage """ # TODO: experiment with dt, filtering # Uchida (2008): dV/dt "estimated by linear fits over 2 second intervals" # should dt just be 1 / freq? i.e. 1/24 Hz dV = np.diff(oxy_volts) # central differences shorten vectors by 1 dt = np.diff(time) dt[dt == 0] = np.median(dt[dt > 0]) # replace with median to avoid dividing by zero dV_dt = dV / dt dV_dt = np.insert(dV_dt, 0, 0) # add zero in front to match original length dV_dt[np.isinf(dV_dt)] = np.nan # this check is probably unnecessary if nan_replace: dV_dt = np.nan_to_num(dV_dt, nan=np.nanmean(dV_dt)) # TODO: should we do some kind of filtering? e.g.: # (PMEL does this calculation on binned data already so filtering is not the same) # a = 1 # windowsize = 5 # b = (1 / windowsize) * np.ones(windowsize) # filtered_dvdt = scipy.signal.filtfilt(b, a, dv_dt) return dV_dt # filtered_dvdt
def _get_sbe_coef(idx=0): """ Get SBE oxygen coefficients from raw .xmlcon files. Defaults to using first station in ssscc.csv file. Returns the following tuple of coefficients: Soc, offset, Tau20, Tcor, E """ # TODO: does scipy's minimize function needs a tuple? can this be improved further? station = process_ctd.get_ssscc_list()[idx] xmlfile = cfg.dirs["raw"] + station + ".XMLCON" tree = ET.parse(xmlfile) root_eq0 = tree.find(".//CalibrationCoefficients[@equation='0']") # Owens-Millard root_eq1 = tree.find(".//CalibrationCoefficients[@equation='1']") # SBE equation coefs = {c.tag: float(c.text) for c in root_eq1} coefs["Tcor"] = float(root_eq0.find("Tcor").text) # only coef needed from eq0 keep_keys = ["Soc", "offset", "Tau20", "Tcor", "E"] return tuple(coefs[key] for key in keep_keys)
[docs]def calculate_weights(pressure): """ Calculate weights (as a function of pressure) for weighted least squares fitting. Deep measurements are weighted higher than shallow. Parameters ---------- presssure : array-like Pressure values of oxygen measurements [dbar] Returns ------- weights : array-like Weight factor for each pressure value """ # TODO: automatic weight calculation rather than hardcoded (machine learning?) epsilon = 1e-5 # small offset to avoid interpolation issues # define piecewise weight function dependent on pressure p_bins = [ 0, 100, 100 + epsilon, 300, 300 + epsilon, 500, 500 + epsilon, 1200, 1200 + epsilon, 2000, 2000 + epsilon, 7000, ] w_bins = [20, 20, 25, 25, 50, 50, 100, 100, 200, 200, 500, 500] wgt = scipy.interpolate.interp1d(p_bins, w_bins) weights = wgt(pressure) # get weights from piecewise function return weights
"""code_pruning: should this be here or in equations_sbe? somewhere else?""" def _PMEL_oxy_eq(coefs, inputs, cc=[1.92634e-4, -4.64803e-2]): """ Modified oxygen equation for SBE 43 used by NOAA/PMEL coef[0] = Soc coef[1] = Voffset coef[2] = Tau20 coef[3] = Tcorr coef[4] = E """ Soc, Voff, Tau20, Tcorr, E = coefs oxyvolts, pressure, temp, dvdt, os = inputs o2 = ( Soc * ( oxyvolts + Voff + Tau20 * np.exp(cc[0] * pressure + cc[1] * (temp - 20)) * dvdt ) * os * np.exp(Tcorr * temp) * np.exp((E * pressure) / (temp + 273.15)) ) return o2
[docs]def PMEL_oxy_weighted_residual(coefs, weights, inputs, refoxy, L_norm=2): # TODO: optionally include other residual types # (abstracted from PMEL code oxygen_cal_ml.m) # unweighted L2: sum((ref - oxy)^2) # if weighted fails # unweighted L4: sum((ref - oxy)^4) # unsure of use case # unweighted L1: sum(abs(ref - oxy)) # very far from ideal # anything else? genericize with integer "norm" function input? residuals = np.sum( (weights * (refoxy - _PMEL_oxy_eq(coefs, inputs)) ** 2) ) / np.sum(weights ** 2) return residuals
[docs]def match_sigmas( btl_prs, btl_oxy, btl_tmp, btl_SA, ctd_os, ctd_prs, ctd_tmp, ctd_SA, ctd_oxyvolts, ctd_time, ): # Construct Dataframe from bottle and ctd values for merging btl_data = pd.DataFrame( data={"CTDPRS": btl_prs, "REFOXY": btl_oxy, "CTDTMP": btl_tmp, "SA": btl_SA} ) time_data = pd.DataFrame( data={ "CTDPRS": ctd_prs, "OS": ctd_os, "CTDTMP": ctd_tmp, "SA": ctd_SA, "CTDOXYVOLTS": ctd_oxyvolts, "CTDTIME": ctd_time, } ) time_data["dv_dt"] = calculate_dV_dt(time_data["CTDOXYVOLTS"], time_data["CTDTIME"]) # Merge DF merged_df = pd.DataFrame( columns=["CTDPRS", "CTDOXYVOLTS", "CTDTMP", "dv_dt", "OS"], dtype=float ) merged_df["REFOXY"] = btl_data["REFOXY"].copy() # calculate sigma referenced to multiple depths for idx, p_ref in enumerate([0, 1000, 2000, 3000, 4000, 5000, 6000]): # pandas 1.2.1 ufunc issue workaround btl_inputs = np.broadcast_arrays( btl_data["SA"], btl_data["CTDTMP"], btl_data["CTDPRS"], p_ref ) time_inputs = np.broadcast_arrays( time_data["SA"], time_data["CTDTMP"], time_data["CTDPRS"], p_ref ) btl_data[f"sigma{idx}"] = ( gsw.pot_rho_t_exact(*btl_inputs) - 1000 # subtract 1000 to get potential density *anomaly* ) + 1e-8 * np.random.standard_normal(btl_data["SA"].size) time_data[f"sigma{idx}"] = ( gsw.pot_rho_t_exact(*time_inputs) - 1000 # subtract 1000 to get potential density *anomaly* ) + 1e-8 * np.random.standard_normal(time_data["SA"].size) rows = (btl_data["CTDPRS"] > (p_ref - 500)) & ( btl_data["CTDPRS"] < (p_ref + 500) ) time_sigma_sorted = time_data[f"sigma{idx}"].sort_values().to_numpy() sigma_min = np.min( [np.min(btl_data.loc[rows, f"sigma{idx}"]), np.min(time_sigma_sorted)] ) sigma_max = np.max( [np.max(btl_data.loc[rows, f"sigma{idx}"]), np.max(time_sigma_sorted)] ) time_sigma_sorted = np.insert(time_sigma_sorted, 0, sigma_min - 1e-4) time_sigma_sorted = np.append(time_sigma_sorted, sigma_max + 1e-4) # TODO: can this be vectorized? cols = ["CTDPRS", "CTDOXYVOLTS", "CTDTMP", "dv_dt", "OS"] inds = np.concatenate(([0], np.arange(0, len(time_data)), [len(time_data) - 1])) for col in cols: merged_df.loc[rows, col] = np.interp( btl_data.loc[rows, f"sigma{idx}"], time_sigma_sorted, time_data[col].iloc[inds], ) # Apply coef and calculate CTDOXY sbe_coef0 = _get_sbe_coef() # initial coefficient guess merged_df["CTDOXY"] = _PMEL_oxy_eq( sbe_coef0, ( merged_df["CTDOXYVOLTS"], merged_df["CTDPRS"], merged_df["CTDTMP"], merged_df["dv_dt"], merged_df["OS"], ), ) return merged_df
[docs]def sbe43_oxy_fit(merged_df, sbe_coef0=None, f_suffix=None): # Plot data to be fit together f_out = f"{cfg.fig_dirs['ox']}sbe43_residual{f_suffix}_prefit.pdf" ctd_plots._intermediate_residual_plot( merged_df["REFOXY"] - merged_df["CTDOXY"], merged_df["CTDPRS"], merged_df["SSSCC"], xlabel="CTDOXY Residual (umol/kg)", f_out=f_out, xlim=(-10, 10), ) bad_df = pd.DataFrame() # initialize DF for questionable values if sbe_coef0 is None: sbe_coef0 = _get_sbe_coef() # load initial coefficient guess # Curve fit (weighted) weights = calculate_weights(merged_df["CTDPRS"]) fit_vars = ["CTDOXYVOLTS", "CTDPRS", "CTDTMP", "dv_dt", "OS"] fit_data = tuple(merged_df[v] for v in fit_vars) res = scipy.optimize.minimize( PMEL_oxy_weighted_residual, x0=sbe_coef0, args=(weights, fit_data, merged_df["REFOXY"]), bounds=[(None, None), (None, None), (0, None), (None, None), (None, None)], ) cfw_coefs = res.x merged_df["CTDOXY"] = _PMEL_oxy_eq(cfw_coefs, fit_data) merged_df["residual"] = merged_df["REFOXY"] - merged_df["CTDOXY"] cutoff = 2.8 * np.std(merged_df["residual"]) thrown_values = merged_df[np.abs(merged_df["residual"]) > cutoff] bad_df = pd.concat([bad_df, thrown_values]) merged_df = merged_df[np.abs(merged_df["residual"]) <= cutoff].copy() while not thrown_values.empty: # runs as long as there are thrown_values p0 = tuple(cfw_coefs) # initialize coefficients with previous results weights = calculate_weights(merged_df["CTDPRS"]) fit_data = tuple(merged_df[v] for v in fit_vars) # merged_df changes each loop res = scipy.optimize.minimize( PMEL_oxy_weighted_residual, x0=p0, args=(weights, fit_data, merged_df["REFOXY"]), bounds=[(None, None), (None, None), (0, None), (None, None), (None, None)], ) cfw_coefs = res.x merged_df["CTDOXY"] = _PMEL_oxy_eq(cfw_coefs, fit_data) merged_df["residual"] = merged_df["REFOXY"] - merged_df["CTDOXY"] cutoff = 2.8 * np.std(merged_df["residual"]) thrown_values = merged_df[np.abs(merged_df["residual"]) > cutoff] # TODO: get some kind of logging in here in case things go awry # e.g. count of thrown values, start/final stdev, failing to converge, etc. bad_df = pd.concat([bad_df, thrown_values]) merged_df = merged_df[np.abs(merged_df["residual"]) <= cutoff].copy() # intermediate plots to diagnose data chunks goodness # TODO: implement into bokeh/flask dashboard if f_suffix is not None: f_out = f"{cfg.fig_dirs['ox']}sbe43_residual{f_suffix}.pdf" ctd_plots._intermediate_residual_plot( merged_df["residual"], merged_df["CTDPRS"], merged_df["SSSCC"], xlabel="CTDOXY Residual (umol/kg)", f_out=f_out, xlim=(-10, 10), ) merged_df["CTDOXY_FLAG_W"] = 2 bad_df["CTDOXY_FLAG_W"] = 3 df = pd.concat([merged_df, bad_df]) return cfw_coefs, df
[docs]def prepare_oxy(btl_df, time_df, ssscc_list): """ Calculate oxygen-related variables needed for calibration: sigma, oxygen solubility (OS), and bottle oxygen Parameters ---------- btl_df : DataFrame CTD data at bottle stops time_df : DataFrame Continuous CTD data ssscc_list : list of str List of stations to process Returns ------- """ # Calculate SA and CT btl_df["SA"] = gsw.SA_from_SP( btl_df[cfg.column["sal"]], btl_df[cfg.column["p"]], btl_df[cfg.column["lon"]], btl_df[cfg.column["lat"]], ) btl_df["CT"] = gsw.CT_from_t( btl_df["SA"], btl_df[cfg.column["t1"]], # oxygen sensor is on primary line (ie t1) btl_df[cfg.column["p"]], ) time_df["SA"] = gsw.SA_from_SP( time_df[cfg.column["sal"]], time_df[cfg.column["p"]], time_df[cfg.column["lon"]], time_df[cfg.column["lat"]], ) time_df["CT"] = gsw.CT_from_t( time_df["SA"], time_df[cfg.column["t1"]], # oxygen sensor is on primary line (ie t1) time_df[cfg.column["p"]], ) # calculate sigma btl_df["sigma_btl"] = gsw.sigma0(btl_df["SA"], btl_df["CT"]) time_df["sigma_btl"] = gsw.sigma0(time_df["SA"], time_df["CT"]) # Calculate oxygen solubility in µmol/kg btl_df["OS"] = gsw.O2sol( btl_df["SA"], btl_df["CT"], btl_df[cfg.column["p"]], btl_df[cfg.column["lon"]], btl_df[cfg.column["lat"]], ) time_df["OS"] = gsw.O2sol( time_df["SA"], time_df["CT"], time_df[cfg.column["p"]], time_df[cfg.column["lon"]], time_df[cfg.column["lat"]], ) # Convert CTDOXY units btl_df["CTDOXY"] = oxy_ml_to_umolkg(btl_df["CTDOXY1"], btl_df["sigma_btl"]) # Calculate bottle oxygen btl_df[cfg.column["refO"]] = calculate_bottle_oxygen( ssscc_list, btl_df["SSSCC"], btl_df["TITR_VOL"], btl_df["TITR_TEMP"], btl_df["FLASKNO"], ) btl_df[cfg.column["refO"]] = oxy_ml_to_umolkg( btl_df[cfg.column["refO"]], btl_df["sigma_btl"] ) btl_df["OXYGEN_FLAG_W"] = flagging.nan_values(btl_df[cfg.column["refO"]]) # Load manual OXYGEN flags if Path("data/oxygen/manual_oxy_flags.csv").exists(): manual_flags = pd.read_csv( "data/oxygen/manual_oxy_flags.csv", dtype={"SSSCC": str} ) for _, flags in manual_flags.iterrows(): df_row = (btl_df["SSSCC"] == flags["SSSCC"]) & ( btl_df["btl_fire_num"] == flags["SAMPNO"] ) btl_df.loc[df_row, "OXYGEN_FLAG_W"] = flags["Flag"] return True
[docs]def calibrate_oxy(btl_df, time_df, ssscc_list): """ Non-linear least squares fit chemical sensor oxygen against bottle oxygen. Parameters ---------- btl_df : DataFrame CTD data at bottle stops time_df : DataFrame Continuous CTD data ssscc_list : list of str List of stations to process Returns ------- """ log.info("Calibrating oxygen (SBE43)") # Plot all pre fit data f_out = f"{cfg.fig_dirs['ox']}sbe43_residual_all_prefit.pdf" ctd_plots._intermediate_residual_plot( btl_df["OXYGEN"] - btl_df["CTDOXY"], btl_df["CTDPRS"], btl_df["SSSCC"], xlabel="CTDOXY Residual (umol/kg)", f_out=f_out, xlim=(-10, 10), ) # Prep vars, dfs, etc. all_sbe43_merged = pd.DataFrame() sbe43_dict = {} all_sbe43_fit = pd.DataFrame() btl_df["dv_dt"] = np.nan # initialize column # Density match time/btl oxy dataframes for ssscc in ssscc_list: time_data = time_df[time_df["SSSCC"] == ssscc].copy() btl_data = btl_df[btl_df["SSSCC"] == ssscc].copy() # can't calibrate without bottle oxygen ("OXYGEN") if (btl_data["OXYGEN_FLAG_W"] == 9).all(): sbe43_dict[ssscc] = np.full(5, np.nan) log.warning(ssscc + " skipped, all oxy data is NaN") continue sbe43_merged = match_sigmas( btl_data[cfg.column["p"]], btl_data[cfg.column["refO"]], btl_data["CTDTMP1"], btl_data["SA"], time_data["OS"], time_data[cfg.column["p"]], time_data[cfg.column["t1"]], time_data["SA"], time_data[cfg.column["oxyvolts"]], time_data["scan_datetime"], ) sbe43_merged = sbe43_merged.reindex(btl_data.index) # add nan rows back in btl_df.loc[ btl_df["SSSCC"] == ssscc, ["CTDOXYVOLTS", "dv_dt", "OS"] ] = sbe43_merged[["CTDOXYVOLTS", "dv_dt", "OS"]] sbe43_merged["SSSCC"] = ssscc all_sbe43_merged = pd.concat([all_sbe43_merged, sbe43_merged]) log.info(ssscc + " density matching done") # Only fit using OXYGEN flagged good (2) all_sbe43_merged = all_sbe43_merged[btl_df["OXYGEN_FLAG_W"] == 2].copy() # Fit ALL oxygen stations together to get initial coefficient guess (sbe_coef0, _) = sbe43_oxy_fit(all_sbe43_merged, f_suffix="_ox0") sbe43_dict["ox0"] = sbe_coef0 # Fit each cast individually for ssscc in ssscc_list: sbe_coef, sbe_df = sbe43_oxy_fit( all_sbe43_merged.loc[all_sbe43_merged["SSSCC"] == ssscc].copy(), sbe_coef0=sbe_coef0, f_suffix=f"_{ssscc}", ) # build coef dictionary if ssscc not in sbe43_dict.keys(): # don't overwrite NaN'd stations sbe43_dict[ssscc] = sbe_coef # all non-NaN oxygen data with flags all_sbe43_fit = pd.concat([all_sbe43_fit, sbe_df]) # TODO: save outlier data from fits? # TODO: secondary oxygen flagging step (instead of just taking outliers from fit routine) # apply coefs time_df["CTDOXY"] = np.nan for ssscc in ssscc_list: if np.isnan(sbe43_dict[ssscc]).all(): log.warning( f"{ssscc} missing oxy data, leaving nan values and flagging as 9" ) time_df.loc[time_df["SSSCC"] == ssscc, "CTDOXY_FLAG_W"] = 9 continue btl_rows = (btl_df["SSSCC"] == ssscc).values time_rows = (time_df["SSSCC"] == ssscc).values btl_df.loc[btl_rows, "CTDOXY"] = _PMEL_oxy_eq( sbe43_dict[ssscc], ( btl_df.loc[btl_rows, cfg.column["oxyvolts"]], btl_df.loc[btl_rows, cfg.column["p"]], btl_df.loc[btl_rows, cfg.column["t1"]], btl_df.loc[btl_rows, "dv_dt"], btl_df.loc[btl_rows, "OS"], ), ) log.info(ssscc + " btl data fitting done") time_df.loc[time_rows, "CTDOXY"] = _PMEL_oxy_eq( sbe43_dict[ssscc], ( time_df.loc[time_rows, cfg.column["oxyvolts"]], time_df.loc[time_rows, cfg.column["p"]], time_df.loc[time_rows, cfg.column["t1"]], time_df.loc[time_rows, "dv_dt"], time_df.loc[time_rows, "OS"], ), ) log.info(ssscc + " time data fitting done") # flag CTDOXY with more than 1% difference time_df["CTDOXY_FLAG_W"] = 2 # TODO: actual flagging of some kind? btl_df["CTDOXY_FLAG_W"] = flagging.by_percent_diff( btl_df["CTDOXY"], btl_df["OXYGEN"], percent_thresh=1 ) # Plot all post fit data f_out = f"{cfg.fig_dirs['ox']}sbe43_residual_all_postfit.pdf" ctd_plots._intermediate_residual_plot( btl_df["OXYGEN"] - btl_df["CTDOXY"], btl_df["CTDPRS"], btl_df["SSSCC"], xlabel="CTDOXY Residual (umol/kg)", f_out=f_out, xlim=(-10, 10), ) f_out = f"{cfg.fig_dirs['ox']}sbe43_residual_all_postfit_flag2.pdf" flag2 = btl_df["CTDOXY_FLAG_W"] == 2 ctd_plots._intermediate_residual_plot( btl_df.loc[flag2, "OXYGEN"] - btl_df.loc[flag2, "CTDOXY"], btl_df.loc[flag2, "CTDPRS"], btl_df.loc[flag2, "SSSCC"], xlabel="CTDOXY Residual (umol/kg)", f_out=f_out, xlim=(-10, 10), ) # export fitting coefs sbe43_coefs = pd.DataFrame.from_dict( sbe43_dict, orient="index", columns=["Soc", "Voffset", "Tau20", "Tcorr", "E"] ).applymap(lambda x: np.format_float_scientific(x, precision=4, exp_digits=1)) sbe43_coefs.to_csv(cfg.dirs["logs"] + "sbe43_coefs.csv") return True