# Source code for ctdcal.process_ctd

import logging
import warnings
from datetime import datetime, timezone
from pathlib import Path

import gsw
import numpy as np
import pandas as pd
import scipy.signal as sig

from . import get_ctdcal_config, io, oxy_fitting

# Package configuration object; this module reads cfg.dirs, cfg.column,
# cfg.ctd_outputs, cfg.ctd_col_names and related export settings from it.
cfg = get_ctdcal_config()
# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Silence warnings whose message starts with "Mean of empty slice."
warnings.filterwarnings("ignore", "Mean of empty slice.")


def cast_details(df, ssscc, log_file=None):
    """
    Trim a cast to its downcast and log summary details of the cast.

    The soak period is removed with ``_trim_soak_period``; values at the start,
    end and deepest point of the remaining data are then passed to
    ``io.write_cast_details``.

    Parameters
    ----------
    df : DataFrame
        Filtered CTD data
    ssscc : integer
        The station and cast, as SSSCC format
    log_file : file handle or string
        File destination for cast details

    Returns
    -------
    df_downcast : DataFrame
        CTD data with the soak period and upcast trimmed off

    Notes
    -----
    The following (float) variables are output to log_file:
    time_start : Time at start of cast (in unix epoch time)
    time_end : Time at end of cast (in unix epoch time)
    time_bottom : Time at bottom of cast (in unix epoch time)
    p_start : Pressure at which cast started
    p_max : Bottom of the cast pressure
    b_lat : Latitude at bottom of cast
    b_lon : Longitude at bottom of cast
    b_alt : Altimeter reading at bottom of cast
    """
    df_cast = _trim_soak_period(df)

    # TODO: call parameters from config file instead
    pressure = df_cast["CTDPRS"]
    scan_time = df_cast["scan_datetime"]

    # Series.argmax() returns a *position*, not an index label, so every lookup
    # that uses it goes through .iloc (label lookup only matched by accident
    # when the index happened to be a fresh RangeIndex).
    p_max_ind = pressure.argmax()

    p_start = float(np.around(pressure.iloc[0], 4))
    p_max = float(np.around(pressure.max(), 4))
    time_start = float(scan_time.iloc[0])
    time_end = float(scan_time.iloc[-1])
    time_bottom = float(scan_time.iloc[p_max_ind])
    b_lat = float(np.around(df_cast["GPSLAT"].iloc[p_max_ind], 4))
    b_lon = float(np.around(df_cast["GPSLON"].iloc[p_max_ind], 4))
    b_alt = float(np.around(df_cast["ALT"].iloc[p_max_ind], 4))

    io.write_cast_details(
        ssscc,
        log_file,
        time_start,
        time_end,
        time_bottom,
        p_start,
        p_max,
        b_alt,
        b_lat,
        b_lon,
    )

    # remove upcast (the row at maximum pressure itself is excluded)
    df_downcast = df_cast.iloc[:p_max_ind].copy()

    return df_downcast


def _trim_soak_period(df=None):
    """
    Cut a cast down to the data following the top of the cast.

    Steps:
    1) Split the data into consecutive runs of constant ``pump_on`` state
    2) Keep the pumps-on run that reaches the largest pressure
    3) Drop everything before the last soak period (``_find_last_soak_period``)
    4) Start the cast at the minimum pressure found near its beginning
    """
    pump_state = df["pump_on"]
    # a new run id starts every time the pump state differs from the previous row
    run_id = pump_state.ne(pump_state.shift()).cumsum()

    pumps_on_runs = []
    for _, run in df.groupby(run_id):
        if run["pump_on"].all():
            pumps_on_runs.append(run)

    deepest_pressures = [run["CTDPRS"].max() for run in pumps_on_runs]
    cast = pumps_on_runs[np.argmax(deepest_pressures)].reset_index(drop=True)

    # next fn deals w/ edge cases, leave as is for now
    cast = _find_last_soak_period(cast)

    # search window is a quarter of the length of the *input* frame
    search_end = len(df) // 4
    start_ind = cast.loc[:search_end, "CTDPRS"].argmin()

    return cast[start_ind:].reset_index(drop=True).copy()


def _find_last_soak_period(df_cast, time_bin=8, P_surface=2, P_downcast=50):
    """
    Find the soak period before the downcast starts.

    The algorithm is tuned for repeat hydrography work, specifically US GO-SHIP
    parameters. This assumes the soak depth will be somewhere between 10 and 30
    meters, the package will sit at the soak depth for at least 20 to 30 seconds
    before starting ascent to the surface and descent to target depth.

    The algorithm is not guaranteed to catch the exact start of the soak period,
    but within a minimum period of time_bin seconds(?) from end of the soak if
    the soak period assumption is valid. This should be shorter than the total
    soak period time, and able to catch the following rise and descent of the
    package that signals the start of the cast.

    The algorithm has been designed to handle four general cases of casts:
        * A routine cast with pumps turning on in water and normal soak
        * A cast where the pumps turn on in air/on deck
        * A cast where the pumps turn on and off due to rosette coming out of water
        * A cast where there are multiple stops on the downcast to the target depth

    Parameters
    ----------
    df_cast : DataFrame
        DataFrame of the entire cast, from deckbox on to deckbox off
    time_bin : integer, optional
        Number of seconds to bin average for descent rate calculation
    P_surface : integer, optional
        Minimum surface pressure threshold required to look for soak depth
        (2 dbar was chosen as an average rosette is roughly 1.5 to 2 meters tall)
    P_downcast : integer, optional
        Minimum pressure threshold required to assume downcast has started
        (50 dbar has been chosen as double the deep soak depth of 20-30 dbar)

    Returns
    -------
    df_cast_trimmed : DataFrame
        DataFrame starting within time_bin seconds of the last soak period.
        If the first pressure value is already above P_surface, df_cast is
        returned unchanged.

    Raises
    ------
    ValueError
        If time_bin or P_downcast is not positive, or if P_downcast is smaller
        than P_surface.

    Notes
    -----
    When the binning path is taken, the input df_cast is modified in place:
    "index" and "bin" columns are added to it.
    """
    # Validate user input
    if time_bin <= 0:
        raise ValueError("Time bin value should be positive whole seconds.")
    if P_downcast <= 0:
        raise ValueError(
            "Starting downcast pressure threshold must be positive integers."
        )
    if P_downcast < P_surface:
        raise ValueError(
            "Starting downcast pressure threshold must be greater \
                        than surface pressure threshold."
        )

    # If pumps have not turned on until in water, return DataFrame
    if df_cast.iloc[0]["CTDPRS"] > P_surface:
        return df_cast

    # Bin the data by time, and compute the average rate of descent
    df_cast["index"] = df_cast.index  # needed at end to identify start_idx
    # Bin edges are spaced time_bin * 24 rows apart
    # NOTE(review): 24 is presumably the scan rate in Hz (other functions in
    # this module name 24 as frames per second) -- confirm
    df_cast["bin"] = pd.cut(
        df_cast.index,
        np.arange(df_cast.index[0], df_cast.index[-1], time_bin * 24),
        labels=False,
        include_lowest=True,
    )
    df_binned = df_cast.groupby("bin").mean()

    # Compute difference of descent rates and label bins
    # (dP is the change in mean pressure between consecutive bins, rounded to
    # whole units, then classified as up / stop / down)
    df_binned["dP"] = df_binned["CTDPRS"].diff().fillna(0).round(0)
    df_binned["movement"] = pd.cut(
        df_binned["dP"], [-1000, -0.5, 0.5, 1000], labels=["up", "stop", "down"]
    )

    # Find all periods where the rosette is not moving
    # (group consecutive bins sharing the same movement label)
    df_group = df_binned.groupby(
        df_binned["movement"].ne(df_binned["movement"].shift()).cumsum()
    )
    df_list = [g for i, g in df_group]

    # Find last soak period before starting descent to target depth
    def find_last(df_list, P_downcast):
        # Walk the movement groups in order and remember the most recent
        # multi-bin "stop" group; return it once a group reaches P_downcast.
        # NOTE(review): last_idx is never initialised, so if no group qualifies
        # before the return is reached this looks like it would raise
        # UnboundLocalError -- confirm and handle
        for idx, df in enumerate(df_list):
            if df["CTDPRS"].max() < P_downcast:
                # make sure it's soak, not a stop to switch to autocast (i.e. A20 2021)
                # TODO: try instead finding max depth then working backwards?
                if df.max()["movement"] == "stop" and len(df) > 1:
                    last_idx = idx
            else:
                return last_idx
        return last_idx

    # Trim off everything before last soak
    # (the binned "index" column is the mean original row label of each bin;
    # int() truncates the first bin's value to get the starting label)
    start_idx = int(df_list[find_last(df_list, P_downcast)].head(1)["index"])
    # reset_index() is called without drop=True, so the previous index is kept
    # as a column in the returned frame
    df_cast_trimmed = df_cast.loc[start_idx:].reset_index()

    return df_cast_trimmed


def ctd_align(inMat=None, col=None, time=0.0):
    """
    Advance one column in time relative to the other columns.

    The selected column is shifted earlier by ``time`` seconds (at 24 frames per
    second); the vacated samples at the end are filled with the column's last
    value. The input is modified in place and also returned.

    Originally written by Courtney Schatzman, docstring by Joseph Gum.
    Need to generate alignment plots in order to properly use ctd_align.

    Parameters
    ----------
    inMat : ndarray, optional
        Structured array (one named field per sensor) holding the CTD data
    col : str, optional
        Name of the column to apply the time advance to
    time : float, optional
        Advance in seconds to apply to the raw data

    Returns
    -------
    inMat : ndarray
        The input with the requested column advanced. Returned unchanged when
        inMat or col is None, when time is not positive, or when inMat is empty.
    """
    # Num of frames per second.
    fl = 24

    if inMat is not None and col is not None and time > 0.0:
        n_rows = len(inMat)
        if n_rows == 0:
            return inMat

        # Number of samples to advance; never more than the data holds, so the
        # shifted column keeps its original length
        advnc = min(int(fl * time), n_rows)
        column = inMat[col]
        last = column[n_rows - 1]
        # np.float was removed in NumPy 1.24; the builtin float is equivalent
        pad = np.full(advnc, float(last), dtype=float)
        inMat[col] = np.concatenate((column[advnc:], pad))

    return inMat


def raw_ctd_filter(df=None, window="triangle", win_size=24, parameters=None):
    """
    Filter raw CTD data using one of three window types (boxcar, hanning, triangle).

    Each requested column is convolved with the window (``mode="same"``) and
    divided by the window sum, so the output has the same length as the input.

    Parameters
    ----------
    df : DataFrame
        Raw CTD data
    window : str, optional
        Type of filter window ("boxcar", "hanning" or "triangle")
    win_size : int, optional
        Length of window in number of samples
    parameters : list of str, optional
        List of DataFrame columns to be filtered

    Returns
    -------
    filter_df : DataFrame
        Copy of the CTD data with filtered parameters

    Raises
    ------
    ValueError
        If columns are to be filtered and window is not a recognised type
    """
    filter_df = df.copy()
    params = [] if parameters is None else list(parameters)
    if not params:
        return filter_df

    # Window functions live in scipy.signal.windows; the aliases in the
    # top-level scipy.signal namespace were removed in recent SciPy releases.
    window_makers = {
        "boxcar": sig.windows.boxcar,
        "hanning": sig.windows.hann,
        "triangle": sig.windows.triang,
    }
    if window not in window_makers:
        raise ValueError(
            f"window must be one of {sorted(window_makers)}, got {window!r}"
        )

    # The window does not depend on the column, so build it once
    win = window_makers[window](win_size)
    win_sum = np.sum(win)
    for p in params:
        filter_df[p] = sig.convolve(filter_df[p], win, mode="same") / win_sum

    return filter_df


def _warn_missing_low_conductivity(cast_half, cond_startup):
    """Log which conductivity channels never drop below the on-deck threshold."""
    for n in [1, 2]:
        cond_col = cfg.column[f"c{n}"]
        if not (cast_half[cond_col] < cond_startup).any():
            log.warning(f"No values below {cond_startup} found for {cond_col}")


def remove_on_deck(df, stacast, cond_startup=20.0, log_file=None):
    """
    Find and remove times when rosette is on deck.
    Optionally log average pressure at start and end of cast.

    The data are split at the maximum pressure. In each half, rows where both
    conductivity channels read below ``cond_startup`` are treated as on deck.
    If no such rows are found on one side, a warning is logged and nothing is
    trimmed from that side.

    Parameters
    ----------
    df : DataFrame
        Raw CTD data
    stacast : str
        Station/cast name
    cond_startup : float, optional
        Minimum conductivity (units?) threshold indicating rosette is in water
    log_file : str, optional
        Path and filename to save start/end deck pressure values

    Returns
    -------
    trimmed_df : DataFrame
        Raw CTD data trimmed to times when rosette is in water
    """
    # TODO: move these to config file
    # Frequency
    fl = 24
    fl2 = fl * 2
    # Half minute
    ms = 30
    time_delay = fl * ms  # time to let CTD pressure reading settle/sit on deck

    # split dataframe into upcast/downcast
    p_max_pos = df["CTDPRS"].argmax()
    downcast = df.iloc[: (p_max_pos + 1)]
    upcast = df.iloc[(p_max_pos + 1) :]

    c1_col = cfg.column["c1"]
    c2_col = cfg.column["c2"]
    p_col = cfg.column["p"]

    # Search each half of df for minimum conductivity
    # threshold to identify when rosette is out of water
    start_df = downcast.loc[
        (downcast[c1_col] < cond_startup) & (downcast[c2_col] < cond_startup),
        p_col,
    ]
    end_df = upcast.loc[
        (upcast[c1_col] < cond_startup) & (upcast[c2_col] < cond_startup),
        p_col,
    ]

    # Evaluate starting and ending pressures
    start_samples = len(start_df)
    if start_samples > time_delay:
        start_p = np.average(start_df.iloc[fl2 : (start_samples - time_delay)])
    else:
        start_seconds = start_samples / fl
        log.warning(
            f"{stacast}: Only {start_seconds:0.1f} seconds of start pressure averaged."
        )
        start_p = np.average(start_df.iloc[fl2:start_samples])

    end_samples = len(end_df)
    if end_samples > time_delay:
        end_p = np.average(end_df.iloc[(time_delay):])
    else:
        end_seconds = end_samples / fl
        log.warning(
            f"{stacast}: Only {end_seconds:0.1f} seconds of end pressure averaged."
        )
        end_p = np.average(end_df)  # just average whatever there is

    # Remove ondeck start and end pressures
    # NOTE(review): the index labels of the on-deck rows are used as positions
    # for .iloc below, which assumes df carries a default RangeIndex -- confirm
    if start_samples == 0:
        # No on-deck rows before the bottom: keep everything from the first row
        # instead of stopping in a debugger / failing on a NaN slice bound
        log.warning("Failed to find starting deck pressure.")
        _warn_missing_low_conductivity(downcast, cond_startup)
        start_idx = 0
    else:
        start_idx = start_df.index.max()

    if end_samples == 0:
        # No on-deck rows after the bottom: keep everything through the last row
        log.warning("Failed to find ending deck pressure.")
        _warn_missing_low_conductivity(upcast, cond_startup)
        end_idx = len(df)
    else:
        end_idx = end_df.index.min()

    # MK (3/23/20, 11am):
    # auto end calculation failed bc cond2 is still >30
    # may have to do manually or just use cond1 for station 00901
    trimmed_df = df.iloc[start_idx:end_idx].copy()

    # Log ondeck pressures
    if log_file is not None:
        io.write_pressure_details(stacast, log_file, start_p, end_p)

    return trimmed_df


def roll_filter(df, p_col="CTDPRS", direction="down"):
    """
    Filter out heaving in CTD data due to ship rolls.

    Only rows whose pressure equals the running extreme of the pressure column
    are kept: the running maximum for a downcast, the running minimum for an
    upcast.

    Parameters
    ----------
    df : DataFrame
        CTD data
    p_col : str, optional
        Name of pressure column
    direction : str, optional
        Direction of cast (i.e. "down" or "up" cast)

    Returns
    -------
    DataFrame
        Rows of df that lie on the monotonic pressure sequence

    Raises
    ------
    ValueError
        If direction is neither "down" nor "up"
    """
    if direction not in ("down", "up"):
        raise ValueError("direction must be one of (up, down)")

    pressure = df[p_col]
    running = pressure.expanding()
    envelope = running.max() if direction == "down" else running.min()
    on_envelope = pressure == envelope

    return df[on_envelope]


def pressure_sequence(df, p_col="CTDPRS", direction="down"):
    """
    Convert CTD time series to a pressure series.

    The time series is reduced to a monotonic pressure sequence with
    ``roll_filter``, averaged into 2-unit pressure bins with ``binning_df``,
    and empty bins are back-filled from the next bin that has data. Flag
    columns of back-filled rows are set to 6 by ``_flag_backfill_data``.

    Parameters
    ----------
    df : DataFrame
        CTD time series data
    p_col : str, optional
        Name of pressure column
    direction : str, optional
        Direction to sequence data

    Returns
    -------
    df_binned : DataFrame
        Pressure binned CTD data
    """
    # TODO: optional start/end pressure values?

    # Design notes (barna), not all implemented here:
    # * input should be in-water data only, as the full down and up time series
    # * if direction is "down", use the input as is
    # * if direction is "up", invert the row order of the input dataframe
    # * the roll filter then treats the other half of the cast as one giant
    #   roll to be filtered out
    # * no need to reverse again afterwards: pressure binning does not care
    #   about row order
    df_filtered = roll_filter(df, p_col, direction=direction)

    # 04/11/21 MK: this is not behaving properly or the order is wrong?
    # Currently this function fills the top-most good CTD value *before* bin avg,
    # so those bin averages are not the same as the first good binned value.
    # df_filled = _fill_surface_data(df_filtered, bin_size=2)

    # TODO: abstract bin_size in config
    # p_col is used throughout (the pressure column name used to be hard-coded
    # here, which ignored a non-default p_col)
    df_binned = binning_df(df_filtered, p_column=p_col, bin_size=2)

    # Bins with no data have NaN pressure: give them their bin label as
    # pressure, then pull every other column up from the next bin with data
    fill_rows = df_binned[p_col].isna()
    df_binned.loc[fill_rows, p_col] = df_binned[fill_rows].index.to_numpy()
    df_binned.bfill(inplace=True)
    df_binned.loc[:, "interp_bool"] = False
    df_binned.loc[fill_rows, "interp_bool"] = True
    df_filled = _flag_backfill_data(df_binned, p_col=p_col).drop(
        columns="interp_bool"
    )

    return df_filled.reset_index(drop=True)


def binning_df(df, p_column="CTDPRS", bin_size=2):
    """Calculate the bin-mean of each numeric column in input dataframe

    Bins are left-closed intervals of width ``bin_size`` starting at 0 and are
    labelled by their lower edge. The pressure column of the result holds that
    label rather than the mean of the measured pressures. Bins without any
    data are kept as all-NaN rows.

    Parameters
    ----------
    df : DataFrame
        Data to be bin-meaned
    p_column : str, optional
        Pressure column name to use for binning
    bin_size : int, optional
        Width of bins (in decibars)

    Returns
    -------
    df_out : DataFrame
        Bin-meaned data, indexed by bin label

    Raises
    ------
    KeyError
        If p_column is not a column of df
    """
    if p_column not in df.columns:
        raise KeyError(f"{p_column} column missing from dataframe")

    p_max = np.ceil(df[p_column].max())
    labels = np.arange(0, p_max, bin_size)
    bin_edges = np.arange(0, p_max + bin_size, bin_size)
    # NOTE(review): with right=False a pressure exactly equal to the last edge
    # falls outside every bin and is dropped; this can happen when the maximum
    # pressure is an exact multiple of bin_size -- confirm whether that matters
    df_out = df.copy()
    df_out.loc[:, "bins"] = pd.cut(
        df[p_column], bins=bin_edges, right=False, include_lowest=True, labels=labels
    )
    df_out.loc[:, p_column] = df_out["bins"].astype(float)

    # observed=False keeps empty bins as NaN rows (callers back-fill them);
    # numeric_only=True skips text columns such as station IDs, which newer
    # pandas no longer drops silently and would otherwise raise on.
    return df_out.groupby("bins", observed=False).mean(numeric_only=True)


def _fill_surface_data(df, bin_size=2):
    """
    Copy first scan from top of cast and propagate up to surface at bin centers

    Rows are added at pressures ``bin_size / 2, 3 * bin_size / 2, ...`` below
    the floor of the first pressure in df, flag columns of the added rows are
    set to 6 via ``_flag_backfill_data``, and the result is back-filled.

    Parameters
    ----------
    df : DataFrame
        CTD data with a "CTDPRS" column; not modified
    bin_size : int, optional
        Spacing of the added surface rows

    Returns
    -------
    DataFrame
        Data with surface rows added and back-filled
    """
    df = df.copy(deep=True)
    p_min = np.floor(df["CTDPRS"].iloc[0])
    df_surface = pd.DataFrame({"CTDPRS": np.arange(bin_size / 2, p_min, bin_size)})
    df_surface["interp_bool"] = True
    df = df_surface.merge(df, on="CTDPRS", how="outer")
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection, which may act on a temporary copy and leave NaNs in
    # the mask; cast to bool so the column is a valid boolean indexer
    df["interp_bool"] = df["interp_bool"].fillna(False).astype(bool)
    df = _flag_backfill_data(df).drop(columns="interp_bool")

    return df.bfill()


def _get_pressure_offset(start_vals, end_vals):
    """
    Finds unique values and calculate mean for pressure offset

    Parameters
    ----------
    start_vals : array_like
        Array of initial ondeck pressure values

    end_vals : array_like
        Array of ending ondeck pressure values
    Returns
    -------
    p_off : float
         Average pressure offset

    """
    p_start = pd.Series(np.unique(start_vals))
    p_end = pd.Series(np.unique(end_vals))
    p_start = p_start[p_start.notnull()]
    p_end = p_end[p_end.notnull()]
    p_off = p_start.mean() - p_end.mean()

    # JACKSON THINKS THIS METHOD SHOULD BE USED TO KEEP START END PAIRS
    #    p_df = pd.DataFrame()
    #    p_df['p_start'] = p_start
    #    p_df['p_end'] = p_end
    #    p_df = p_df[p_df['p_end'].notnull()]
    #    p_df = p_df[p_df['p_start'].notnull()]
    #    p_off = p_df['p_start'].mean() - p_df['p_end'].mean()
    ##########################################################

    p_off = np.around(p_off, decimals=4)

    return p_off


def apply_pressure_offset(df, p_col="CTDPRS"):
    # TODO: import p_col from config file
    """
    Shift the pressure column by the offset derived from the deck pressure log.

    The offset is computed by ``_get_pressure_offset`` from the
    ``ondeck_start_p`` and ``ondeck_end_p`` columns of ``ondeck_pressure.csv``
    in the logs directory. A flag column named ``p_col + "_FLAG_W"`` is set
    to 2, indicating the data are calibrated. The input is modified in place.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing column with pressure values
    p_col : str, optional
        Pressure column name in DataFrame (defaults to CTDPRS)

    Returns
    -------
    df : DataFrame
        DataFrame containing updated pressure values and a new flag column

    """
    deck_log_path = cfg.dirs["logs"] + "ondeck_pressure.csv"
    # "Started in Water" entries are read as missing values
    deck_log = pd.read_csv(
        deck_log_path,
        dtype={"SSSCC": str},
        na_values="Started in Water",
    )
    offset = _get_pressure_offset(
        deck_log["ondeck_start_p"], deck_log["ondeck_end_p"]
    )

    flag_col = p_col + "_FLAG_W"
    df[p_col] += offset
    df[flag_col] = 2

    return df


def make_depth_log(time_df, threshold=80):
    # TODO: get column names from config file
    """
    Write depth_log.csv with one bottom depth per station/cast.

    For each SSSCC the row of maximum pressure is located; depth is the
    altimeter reading at that row plus the depth computed from its pressure
    and latitude. Altimeter readings above the threshold are discarded, in
    which case the logged depth is -999.

    Parameters
    ----------
    time_df : DataFrame
        DataFrame containing continuous CTD data
    threshold : int, optional
        Maximum altimeter reading to consider cast "at the bottom" (defaults to 80)

    Returns
    -------
    bool
        Always True

    """
    # TODO: make inputs be arraylike rather than dataframe
    needed_cols = ["SSSCC", "CTDPRS", "GPSLAT", "ALT"]
    df = time_df[needed_cols].copy().reset_index()

    # row label of the deepest scan of every cast, in order of first appearance
    idx_p_max = df.groupby("SSSCC", sort=False)["CTDPRS"].idxmax()
    deepest = df.loc[idx_p_max]

    bottom_df = pd.DataFrame(
        data={
            "SSSCC": df["SSSCC"].unique(),
            "max_p": deepest["CTDPRS"],
            "lat": deepest["GPSLAT"],
            "alt": deepest["ALT"],
        }
    )

    too_far_from_bottom = bottom_df["alt"] > threshold
    bottom_df.loc[too_far_from_bottom, "alt"] = np.nan

    # pandas 1.2.1 ufunc issue workaround with pd.to_numpy()
    height = gsw.z_from_p(bottom_df["max_p"], bottom_df["lat"].to_numpy())
    depth = bottom_df["alt"] + np.abs(height)
    bottom_df["DEPTH"] = depth.fillna(value=-999).round().astype(int)

    out_path = cfg.dirs["logs"] + "depth_log.csv"
    bottom_df[["SSSCC", "DEPTH"]].to_csv(out_path, index=False)

    return True


def make_ssscc_list(fname="data/ssscc.csv"):
    """
    Build the station/cast list from the ``*.hex`` files in the raw directory.

    The sorted file stems are written to ``fname`` (one per line, no header)
    and returned. The file is opened in exclusive-creation mode ("x").
    """
    raw_dir = Path(cfg.dirs["raw"])
    ssscc_list = sorted(hex_file.stem for hex_file in raw_dir.glob("*.hex"))

    as_series = pd.Series(ssscc_list, dtype=str)
    as_series.to_csv(fname, header=None, index=False, mode="x")

    return ssscc_list


def get_ssscc_list(fname="data/ssscc.csv"):
    """
    Load in list of stations/casts to process.

    Parameters
    ----------
    fname : str, optional
        Path of a text file with one station/cast per line

    Returns
    -------
    ssscc_list : list of str
        Station/cast names with surrounding whitespace removed; blank lines
        are skipped so they do not turn into empty station names
    """
    with open(fname, "r") as ssscc_file:
        stripped_lines = (line.strip() for line in ssscc_file)
        ssscc_list = [ssscc for ssscc in stripped_lines if ssscc]

    return ssscc_list


def load_all_ctd_files(ssscc_list):
    """
    Load the pickled time-series file of every station/cast and stack them.

    Each file ``<ssscc>_time.pkl`` in the time directory is read, tagged with
    its SSSCC, and given a ``dv_dt`` column computed by
    ``oxy_fitting.calculate_dV_dt`` from the oxygen voltage and scan time.
    The stacked frame gets a running ``master_index`` column.

    Parameters
    ----------
    ssscc_list : list of str
        List of stations to load

    Returns
    -------
    df_data_all : DataFrame
        Merged dataframe containing all loaded data

    """

    def _load_cast(ssscc):
        log.info("Loading TIME data for station: " + ssscc + "...")
        cast_df = pd.read_pickle(cfg.dirs["time"] + ssscc + "_time.pkl")
        cast_df["SSSCC"] = str(ssscc)
        cast_df["dv_dt"] = oxy_fitting.calculate_dV_dt(
            cast_df["CTDOXYVOLTS"], cast_df["scan_datetime"]
        )
        return cast_df

    frames = [_load_cast(ssscc) for ssscc in ssscc_list]
    merged = pd.concat(frames, axis=0, sort=False)
    merged["master_index"] = range(len(merged))

    return merged


def manual_backfill(df, p_cutoff, p_col="CTDPRS", flag_suffix="_FLAG_W"):
    """
    Overwrite values below cutoff pressure by backfilling the first data point past
    threshold upward to the surface. Backfilled data are flagged 6.

    Parameters
    ----------
    df : DataFrame
        Input data; not modified
    p_cutoff : float
        Cutoff pressure for backfilling
    p_col : str, optional
        Name of pressure column in df
    flag_suffix : str, optional
        Parameter suffix for data flags

    Returns
    -------
    df : DataFrame
        Copy of the input DataFrame with backfilled data
    """
    df = df.copy(deep=True)
    cols = df.columns.drop(p_col)
    df["interp_bool"] = df[p_col] < p_cutoff
    # blank every non-pressure column shallower than the cutoff ...
    df.loc[df["interp_bool"], cols] = np.nan
    # ... flag those rows (forwarding the caller's flag_suffix, which used to
    # be accepted but ignored) ...
    df = _flag_backfill_data(df, p_col=p_col, flag_suffix=flag_suffix).drop(
        columns="interp_bool"
    )

    # ... and fill the blanks from the first row at or past the cutoff
    return df.bfill()


def _flag_backfill_data(
    df, p_col="CTDPRS", flag_bool_col="interp_bool", flag_suffix="_FLAG_W"
):
    """Flag data columns which have been interpolated with flag 6."""
    for col in df.columns:
        if flag_suffix in col:
            df.loc[df[flag_bool_col], col] = 6

    return df


def export_ct1(df, ssscc_list):
    """
    Export continuous CTD (i.e. time) data to data/pressure/ directory as well as
    adding quality flags and removing unneeded columns.

    For every station/cast the data are converted to a pressure series with
    ``pressure_sequence``, reduced to the configured output columns, and
    written to ``<ssscc>_ct1.csv`` in the pressure directory with a header
    block, column names, units, the data and a closing ``END_DATA`` line.

    Parameters
    ----------
    df : DataFrame
        Continuous CTD data. Modified in place: flag columns are added,
        columns are renamed per the configured outputs, and configured
        columns that are missing are added with fill values.
    ssscc_list : list of str
        List of stations to export

    Returns
    -------
    None

    Notes
    -----
    Reads bottom_bottle_details.csv and depth_log.csv from the logs directory.
    manual_depth_log.csv is read as well; if it does not exist it is created
    as a copy of depth_log.csv.

    """
    log.info("Exporting CTD files")

    # initial flagging (some of this should be moved)
    # TODO: lump all uncalibrated together; smart flagging like ["CTD*_FLAG_W"] = 1
    # TODO: may not always have these channels so don't hardcode them in!
    df["CTDFLUOR_FLAG_W"] = 1
    df["CTDXMISS_FLAG_W"] = 1
    # df["CTDBACKSCATTER_FLAG_W"] = 1

    # rename outputs as defined in user_settings.yaml
    for param, attrs in cfg.ctd_outputs.items():
        if param not in df.columns:
            df.rename(columns={attrs["sensor"]: param}, inplace=True)

    # check that all columns are there; missing ones are determined directly
    # rather than by parsing the text of a KeyError, whose wording is not a
    # stable interface
    missing_cols = [col for col in cfg.ctd_col_names if col not in df.columns]
    if missing_cols:
        log.info("Column names not configured properly... attempting to correct")
        for col in missing_cols:
            if col.endswith("FLAG_W"):
                log.warning(col + " missing, flagging with 9s")
                df[col] = 9
            else:
                log.warning(col + " missing, filling with -999s")
                df[col] = -999

    df["SSSCC"] = df["SSSCC"].astype(str).copy()
    # named *_df so it does not shadow the module-level cast_details function
    cast_details_df = pd.read_csv(
        # cfg.dirs["logs"] + "cast_details.csv", dtype={"SSSCC": str}
        cfg.dirs["logs"] + "bottom_bottle_details.csv",
        dtype={"SSSCC": str},
    )
    depth_df = pd.read_csv(
        cfg.dirs["logs"] + "depth_log.csv", dtype={"SSSCC": str}, na_values=-999
    ).dropna()
    try:
        manual_depth_df = pd.read_csv(
            cfg.dirs["logs"] + "manual_depth_log.csv", dtype={"SSSCC": str}
        )
    except FileNotFoundError:
        # TODO: add logging; look into inheriting/extending a class to add features
        log.warning("manual_depth_log.csv not found... duplicating depth_log.csv")
        manual_depth_df = depth_df.copy()  # write manual_depth_log as copy of depth_log
        manual_depth_df.to_csv(cfg.dirs["logs"] + "manual_depth_log.csv", index=False)
    # depth_log entries take precedence over manual entries for the same SSSCC
    full_depth_df = pd.concat([depth_df, manual_depth_df])
    full_depth_df.drop_duplicates(subset="SSSCC", keep="first", inplace=True)

    # these header rows are the same for every cast
    names_row = ",".join(str(name) for name in cfg.ctd_col_names)
    units_row = ",".join(str(unit) for unit in cfg.ctd_col_units)

    for ssscc in ssscc_list:

        time_data = df[df["SSSCC"] == ssscc].copy()
        time_data = pressure_sequence(time_data)
        # switch oxygen primary sensor to rinko
        # if int(ssscc[:3]) > 35:
        # Only done when the Rinko channels exist, so casts without a Rinko
        # are exported with their existing CTDOXY instead of failing.
        if {"CTDRINKO", "CTDRINKO_FLAG_W"}.issubset(time_data.columns):
            log.info(f"Using Rinko as CTDOXY for {ssscc}")
            time_data.loc[:, "CTDOXY"] = time_data["CTDRINKO"]
            time_data.loc[:, "CTDOXY_FLAG_W"] = time_data["CTDRINKO_FLAG_W"]
        time_data = time_data[cfg.ctd_col_names]
        # time_data = time_data.round(4)
        time_data = time_data.where(~time_data.isnull(), -999)  # replace NaNs with -999

        # force flags back to int (TODO: make flags categorical)
        for col in time_data.columns:
            if col.endswith("FLAG_W"):
                time_data[col] = time_data[col].astype(int)

        try:
            depth = full_depth_df.loc[full_depth_df["SSSCC"] == ssscc, "DEPTH"].iloc[0]
        except IndexError:
            log.warning(f"No depth logged for {ssscc}, setting to -999")
            depth = -999

        # get cast_details for current SSSCC
        cast_dict = cast_details_df[cast_details_df["SSSCC"] == ssscc].to_dict(
            "records"
        )[0]
        # bottom time is a unix timestamp; split into [YYYYMMDD, HHMM] in UTC
        b_datetime = (
            datetime.fromtimestamp(cast_dict["bottom_time"], tz=timezone.utc)
            .strftime("%Y%m%d %H%M")
            .split(" ")
        )
        # TODO: yo-yo casts are an edge case where this may be different
        btm_lat = cast_dict["latitude"]
        btm_lon = cast_dict["longitude"]

        now = datetime.now(timezone.utc)
        file_datetime = now.strftime("%Y%m%d")  # %H:%M")
        file_datetime = file_datetime + "ODFSIO"
        # TODO: only "cast" needs to be int; "station" is explicitly allowed to incl.
        # letters/etc. Moving from SSSCC to station & cast fields will be beneficial
        with open(f"{cfg.dirs['pressure']}{ssscc}_ct1.csv", "w") as f:
            # put in logic to check columns?
            # number_headers should be calculated, not defined
            ctd_header = (  # this is ugly but prevents tabs before label
                f"CTD,{file_datetime}\n"
                f"NUMBER_HEADERS = 11\n"
                f"EXPOCODE = {cfg.expocode}\n"
                f"SECT_ID = {cfg.section_id}\n"
                f"STNNBR = {ssscc[:3]}\n"  # STNNBR = SSS
                f"CASTNO = {ssscc[3:]}\n"  # CASTNO = CC
                f"DATE = {b_datetime[0]}\n"
                f"TIME = {b_datetime[1]}\n"
                f"LATITUDE = {btm_lat:.4f}\n"
                f"LONGITUDE = {btm_lon:.4f}\n"
                f"INSTRUMENT_ID = {cfg.ctd_serial}\n"
                f"DEPTH = {depth:.0f}\n"
            )
            f.write(ctd_header)
            f.write(names_row + "\n")
            f.write(units_row + "\n")
            time_data.to_csv(f, header=False, index=False)
            f.write("END_DATA")