# Source code for pyspedas.tplot_tools.importers.cdf_to_tplot

# Copyright 2020 Regents of the University of Colorado. All Rights Reserved.
# Released under the MIT license.
# This software was developed at the University of Colorado's Laboratory for
# Atmospheric and Space Physics.
# Verify current version before use at: https://github.com/MAVENSDC/PyTplot

import cdflib
import logging
import re
import numpy as np
import xarray as xr
import datetime
from datetime import timedelta
from pyspedas.tplot_tools import store_data
from pyspedas.tplot_tools import tplot
from pyspedas.tplot_tools import options
import pyspedas
import copy
from collections.abc import Iterable



[docs]
def cdf_to_tplot(filenames, mastercdf=None, varformat=None, exclude_format=None, get_support_data=False, get_metadata=False,
                 get_ignore_data=False, string_encoding='ascii',
                 prefix='', suffix='', plot=False, merge=False,
                 center_measurement=False, notplot=False, varnames=None):
    """
    This function will automatically create tplot variables from CDF files.  In general, the files should be
    ISTP compliant for this importer to work.  Each variable is read into a new tplot variable (a.k.a. an xarray DataArray),
    and all associated file/variable metadata is read into the attrs dictionary.

    .. note::
        Variables must have an attribute named "VAR_TYPE". If the attribute entry
        is "data" (or "support_data"), then they will be added as tplot variables.
        Additionally, data variables should have an attribute named "DEPEND_TIME" or
        "DEPEND_0" that describes which variable is the x axis.  If the data is 2D,
        then an attribute "DEPEND_1" must describe which variable contains the
        secondary axis.

    Parameters:
        filenames : str/list of str
            The file names and full paths of CDF files.
        mastercdf : str
            The file name of a master CDF to be used, if any
        varformat : str or list[str]
            The file variable formats to load into tplot.  Wildcard character
            "*" is accepted.  By default, all variables are loaded in.
        exclude_format : str or list[str]
            The file variable formats to exclude from loading into tplot.  Wildcard character
            "*" is accepted. By default, no variables are excluded.
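        get_support_data: bool
            If True, data with an attribute "VAR_TYPE" with a value of "support_data"
            will also be loaded into tplot.  By default, only loads in data with a
            "VAR_TYPE" attribute of "data".  This option is turned on automatically
            when varformat or a non-empty varnames is supplied.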
        prefix: str
            The tplot variable names will be given this prefix.  By default,
            no prefix is added.
        suffix: str
            The tplot variable names will be given this suffix.  By default,
            no suffix is added.
        plot: bool
            The data is plotted immediately after being generated.  All tplot
            variables generated from this function will be on the same plot.
        merge: bool
            If True, then data from different cdf files will be merged into
            a single tplot variable.
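        get_ignore_data: bool
            If True, data with an attribute "VAR_TYPE" with a value of "ignore_data"
            will also be loaded into tplot.  By default, only loads in data with a
            "VAR_TYPE" attribute of "data".
        get_metadata: bool
            If True, data with an attribute "VAR_TYPE" with a value of "metadata"
            will also be loaded into tplot.  By default, only loads in data with a
            "VAR_TYPE" attribute of "data".
        string_encoding: str
            Value assigned to the string_encoding attribute of each cdflib CDF
            object that is opened (data files and master CDF).  Default is 'ascii'.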
        center_measurement: bool
            If True, the CDF epoch variables are time-shifted to the middle
            of the accumulation interval by their DELTA_PLUS_VAR and
            DELTA_MINUS_VAR variable attributes
        notplot: bool
            If True, then data are returned in a hash table instead of
            being stored in tplot variables (useful for debugging, and
            access to multi-dimensional data products)
        varnames: str or list of str
            Load these variables only. If None or [] or ['*'], then load everything.

    Returns:
        List of tplot variables created (unless notplot keyword is used).
    """

    stored_variables = []
    epoch_cache = {}
    output_table = {}
    metadata = {}

    new_cdflib = False
    if cdflib.__version__ > "0.4.9":
        new_cdflib = True
        logging.debug("Using new version of cdflib (%s)", cdflib.__version__)
    else:
        new_cdflib = False
        logging.debug("Using old version of cdflib (%s)", cdflib.__version__)

    if prefix is None:
        prefix = ''

    if suffix is None:
        suffix = ''

    # When doing varname, varformat, or exclude_format checks, do we need to take prefixes or suffixes into account?
    check_pre_suff = False
    if prefix != '' or suffix != '':
        check_pre_suff = True

    # If nontrivial varnames or varformat are explicitly supplied, load the requested data whether or not
    # it's marked as support_data in the CDF.

    if varformat is not None:
        get_support_data = True

    if varnames is None:
        varnames = []

    if not isinstance(varnames, list):
        varnames = [varnames]

    if len(varnames) > 0:
        get_support_data = True
        if '*' in varnames:
            varnames = []

    # pyspedas.tplot_tools.data_quants = {}
    if isinstance(filenames, str):
        filenames = [filenames]
    elif isinstance(filenames, list):
        pass
    else:
        logging.warning("Invalid filenames input. Must be string or list of strings.")
        return stored_variables

    var_type = ['data']
    if varformat is None:
        varformat = ".*"
    if get_support_data:
        var_type.append('support_data')
    if get_metadata:
        var_type.append('metadata')
    if get_ignore_data:
        var_type.append('ignore_data')

    # Replace lists with regex alternation
    if isinstance(varformat, list):
        varformat = '|'.join(varformat)
    # Replace spaces with regex alternation
    if ' ' in varformat:
        explode=varformat.split()
        varformat='|'.join(explode)

    varformat = varformat.replace("*", ".*")
    var_regex = re.compile(varformat)

    if exclude_format is not None:
        # Replace spaces and lists with regex alternation as we did for varformat
        if isinstance(exclude_format, list):
            exclude_format = '|'.join(exclude_format)
        # Replace spaces with regex alternation
        if ' ' in exclude_format:
            explode=exclude_format.split()
            exclude_format='|'.join(explode)

        exclude_format = exclude_format.replace("*",".*")
        exclude_regex = re.compile(exclude_format)
    else:
        exclude_regex = None


    # This step may not be appropriate if the lexicographic sort does not correspond to a time sort. (For example,
    # if filenames contain orbit numbers rather than dates, and no leading zeroes are used.)  JWL 2023-03-17

    filenames.sort()

    # Get metadata from master CDF, if provided
    # In IDL, cdf2tplot uses the first file provided as a de-facto master CDF.
    # in pyspedas, cdf_to_tplot can do things like loading data from all 4 MMS probes in a single call.
    # So, we can't always use the first CDF in the list, because it may not apply to other files in the list.
    # Therefore, we supply a master CDF, if needed, in a separate argument. JWL 2023-03-17

    if not mastercdf is None:
        mastercdf_flag = True
        logging.debug('Processing master CDF %s', mastercdf)
        master_cdf_file = cdflib.CDF(mastercdf)
        master_cdf_file.string_encoding = string_encoding
        master_cdf_info = master_cdf_file.cdf_info()
        if new_cdflib:
            master_cdf_variables = master_cdf_info.rVariables + master_cdf_info.zVariables
        else:
            master_cdf_variables = master_cdf_info['rVariables'] + master_cdf_info['zVariables']

        logging.debug("master_cdf_variables: " + str(master_cdf_variables))
    else:
        mastercdf_flag = False

    logging.debug("Input filenames: " + str(filenames))
    for filename in filenames:
        logging.debug('Processing filename %s', filename)
        cdf_file = cdflib.CDF(filename)
        cdf_file.string_encoding = string_encoding
        cdf_info = cdf_file.cdf_info()
        if new_cdflib:
            all_cdf_variables = cdf_info.rVariables + cdf_info.zVariables
        else:
            all_cdf_variables = cdf_info['rVariables'] + cdf_info['zVariables']

        logging.debug("all_cdf_variables: " + str(all_cdf_variables))
        if not mastercdf_flag:
            # If not using a master CDF, each CDF is its own master
            master_cdf_file = cdf_file
            mastercdf = filename
            master_cdf_variables = all_cdf_variables
        # User defined variables.
        if len(varnames) > 0:
            load_cdf_variables = [value for value in varnames if value in all_cdf_variables]
            if check_pre_suff:
                pre_suff = [value for value in all_cdf_variables if prefix+value+suffix in varnames]
                load_cdf_variables.extend(pre_suff)
        else:
            load_cdf_variables = all_cdf_variables

        try:
            gatt = master_cdf_file.globalattsget()
        except:
            logging.warning('Unable to get global attributes for filename %s', mastercdf)
            gatt = {}

        for var in load_cdf_variables:
            if not re.match(var_regex, var) and (not check_pre_suff or not re.match(var_regex, prefix+var+suffix)):
                logging.debug("Variable %s does not match varformat, skipping", var)
                continue
            elif exclude_regex is not None and (re.match(exclude_regex, var) or (check_pre_suff and re.match(exclude_regex, prefix+var+suffix))):
                logging.debug("Variable %s matches exclude_format, skipping", var)
                continue
            logging.debug('Processing variable attributes for %s', var)
            try:
                var_atts = master_cdf_file.varattsget(var)
            except ValueError:
                logging.warning("Unable to get variable attributes for %s in file %s, skipping", var, mastercdf)
                continue

            if 'VIRTUAL' in var_atts:
                this_virtual = var_atts['VIRTUAL'].lower()
                if this_virtual=="true":
                    logging.debug("Skipping virtual variable %s",var)
                    continue
            elif 'FUNCT' in var_atts and 'COMPONENT_0' in var_atts:
                logging.info("Variable %s not marked as VIRTUAL, but has FUNCT and COMPONENT_0 attributes; skipping", var)
                continue

            if 'VAR_TYPE' in var_atts:
                this_var_type = var_atts['VAR_TYPE'].lower()
            elif 'PARAMETER_TYPE' in var_atts:
                this_var_type = var_atts['PARAMETER_TYPE'].lower()
            else:
                # 'VAR_TYPE' and 'PARAMETER_TYPE' not found in the variable attributes
                logging.info('No VAR_TYPE or PARAMETER_TYPE attributes defined for variable %s, skipping', var)
                continue

            if this_var_type in var_type:
                var_properties = master_cdf_file.varinq(var)
                var_properties_data_cdf = cdf_file.varinq(var)

                # Find data name and if it is already in stored variables
                if 'TPLOT_NAME' in var_atts:
                    var_name = prefix + var_atts['TPLOT_NAME'] + suffix
                else:
                    var_name = prefix + var + suffix

                # Is this variable marked as non-record-varying?  This may
                # differ between the data and master CDFs.

                if new_cdflib:
                    rec_vary_data \
                        = var_properties_data_cdf.Rec_Vary
                    rec_vary_master \
                        = var_properties.Rec_Vary
                else:
                    rec_vary_data \
                        = var_properties_data_cdf["Rec_Vary"]
                    rec_vary_master \
                        = var_properties["Rec_Vary"]
                if (rec_vary_master != rec_vary_data):
                    logging.warning("Master and data CDFs have different values for Rec_Vary property on variable %s, using %s from master CDF.", var, rec_vary_master)
                rec_vary = rec_vary_master

                nrv_has_times = False
                if "DEPEND_TIME" in var_atts:
                    x_axis_var = var_atts["DEPEND_TIME"]
                    if not rec_vary:
                        logging.warning("Variable %s is marked non-record-varying, but has DEPEND_TIME attribute",var)
                        nrv_has_times = True
                elif "DEPEND_0" in var_atts:
                    x_axis_var = var_atts["DEPEND_0"]
                    if not rec_vary:
                        logging.warning("Variable %s is marked non-record-varying, but has DEPEND_0 attribute",var)
                        nrv_has_times = True

                else:
                    # non-record varying variables (NRVs)
                    # added by egrimes, 13Jan2021
                    # here we assume if there isn't a DEPEND_TIME or DEPEND_0, there are no other depends
                    logging.debug(
                        'No DEPEND_TIME or DEPEND_0 attributes found for variable %s, filename %s . Treating as non-record-varying.',
                        var, filename)
                    if rec_vary and ('epoch' not in var.lower()):
                        logging.warning("Variable %s is marked as record-varying, but no DEPEND_TIME or DEPEND_0 attributes found. Treating as non-record-varying.",var)
                    try:
                        ydata = cdf_file.varget(var)
                    except:
                        logging.debug('Unable to get ydata for non-record-varying variable %s, filename %s', var, filename)
                        continue

                    if ydata is None:
                        continue

                    # since NRVs don't vary with time, they shouldn't vary across files
                    output_table[var_name] = {'y': ydata}

                    continue

                if x_axis_var not in master_cdf_variables:
                    logging.warning('Variable %s timestamp variable %s not found, skipping', var, x_axis_var)
                    continue

                if new_cdflib:
                    data_type_description \
                        = cdf_file.varinq(x_axis_var).Data_Type_Description
                else:
                    data_type_description \
                        = cdf_file.varinq(x_axis_var)["Data_Type_Description"]


                if epoch_cache.get(filename + x_axis_var) is None:
                    delta_plus_var = 0.0
                    delta_minus_var = 0.0
                    delta_time = 0.0

                    # Skip variables with ValueErrors.
                    try:
                        xdata = cdf_file.varget(x_axis_var)
                        epoch_var_atts = cdf_file.varattsget(x_axis_var)
                    except ValueError:
                        logging.debug('Problem getting data for variable %s, filename %s', var, filename)
                        continue

                    # check for DELTA_PLUS_VAR/DELTA_MINUS_VAR attributes
                    if center_measurement:
                        if 'DELTA_PLUS_VAR' in epoch_var_atts:
                            delta_plus_var = cdf_file.varget(epoch_var_atts['DELTA_PLUS_VAR'])
                            delta_plus_var_att = cdf_file.varattsget(epoch_var_atts['DELTA_PLUS_VAR'])

                            # check if a conversion to seconds is required
                            if 'SI_CONVERSION' in delta_plus_var_att:
                                si_conv = delta_plus_var_att['SI_CONVERSION']
                                delta_plus_var = delta_plus_var.astype(float) * np.float64(si_conv.split('>')[0])
                            elif 'SI_CONV' in delta_plus_var_att:
                                si_conv = delta_plus_var_att['SI_CONV']
                                delta_plus_var = delta_plus_var.astype(float) * np.float64(si_conv.split('>')[0])

                        if 'DELTA_MINUS_VAR' in epoch_var_atts:
                            delta_minus_var = cdf_file.varget(epoch_var_atts['DELTA_MINUS_VAR'])
                            delta_minus_var_att = cdf_file.varattsget(epoch_var_atts['DELTA_MINUS_VAR'])

                            # check if a conversion to seconds is required
                            if 'SI_CONVERSION' in delta_minus_var_att:
                                si_conv = delta_minus_var_att['SI_CONVERSION']
                                delta_minus_var = delta_minus_var.astype(float) * np.float64(si_conv.split('>')[0])
                            elif 'SI_CONV' in delta_minus_var_att:
                                si_conv = delta_minus_var_att['SI_CONV']
                                delta_minus_var = delta_minus_var.astype(float) * np.float64(si_conv.split('>')[0])

                        # sometimes these are specified as arrays
                        if isinstance(delta_plus_var, np.ndarray) and isinstance(delta_minus_var, np.ndarray):
                            delta_time = (delta_plus_var - delta_minus_var) / 2.0
                        else:  # and sometimes constants
                            if delta_plus_var != 0.0 or delta_minus_var != 0.0:
                                delta_time = (delta_plus_var - delta_minus_var) / 2.0

                if epoch_cache.get(filename + x_axis_var) is None:
                    if ('CDF_TIME' in data_type_description) or \
                            ('CDF_EPOCH' in data_type_description):
                        # the old way:
                        # store the times as unix times, and cache them
                        # xdata = cdfepoch.unixtime(xdata)
                        # epoch_cache[filename+x_axis_var] = np.array(xdata)+delta_time
                        # the new way:
                        # store and cache the datetime objects directly
                        # and delay conversion to unix times until get_data is called

                        # Cluster uses multidimensional DEPEND_0 values on some "caveat" and "dsettings" variables. This will cause the
                        # xdata[0] < 0.0 test to crash
                        if len(xdata.shape) > 1:
                            logging.warning("CDF DEPEND_0 attribute %s for variable %s is multidimensional with shape %s, skipping", var_atts['DEPEND_0'], var, str(xdata.shape),)
                            continue
                        # Cluster apparently uses (-1.0e-31) as time tag fill values??  Better check...
                        # if xdata[0] < 0.0:
                        #    logging.warning("CDF time tag %e for variable %s cannot be converted to datetime, skipping",xdata[0],var)
                        #    continue

                        # Check for all-fill Cluster times
                        # NOTE: At least for Cluster onboard moments downloaded from CSA,
                        # there is a FILLVAL specified on the time_tags variable, but
                        # it doesn't seem to match the actual fill value we're seeing.
                        # The time_tags FILLVAL attribute has -1e+31 (very large negative value) while
                        # the ones we're actually seeing in the data are -1e-31 (very small negative value)
                        is_cluster_fill = xdata == -1.0e-31
                        if is_cluster_fill.all():
                            logging.warning("Time variable %s for data variable %s has values all equal to -1.0e-31", x_axis_var, var)
                        # Check if time variable has a FILLVAL attribute defined.
                        if 'FILLVAL' in epoch_var_atts:
                            fillval = epoch_var_atts['FILLVAL']
                            is_fillval = xdata == fillval
                            if is_fillval.any():
                                logging.warning("Time variable %s for data variable %s contain at least one time equal to FILLVAL (%e)", x_axis_var, var, fillval)
                        xdata = np.array(cdflib.cdfepoch.to_datetime(xdata))
                        if isinstance(xdata[0],datetime.datetime):
                            # old cdflib < 1.0.0 returns datetime.datetime objects
                            if isinstance(delta_time, np.ndarray) or isinstance(delta_time, list):
                                delta_t = np.array([timedelta(seconds=dtime) for dtime in delta_time])
                            else:
                                delta_t = timedelta(seconds=delta_time)
                        else:
                            # new cdflib >= 1.0.0 returns np.datetime64 objects
                            if isinstance(delta_time, np.ndarray) or isinstance(delta_time, list):
                                delta_t = np.array([np.timedelta64(int(dtime*1e9),'ns') for dtime in delta_time])
                            else:
                                delta_t = np.timedelta64(int(delta_time*1e9),'ns')

                        epoch_cache[filename + x_axis_var] = xdata + delta_t
                else:
                    xdata = epoch_cache[filename + x_axis_var]

                try:
                    ydata = cdf_file.varget(var)
                except:
                    logging.warning('Unable to get ydata for variable %s', var)
                    continue

                if ydata is None:
                    logging.info('No ydata for variable %s', var)
                    continue
                elif np.isscalar(ydata):
                    # Cluster sets FILLVAL attributes on scalar quantities (!) so we need to check...
                    # This can happen for density variables in the Cluster onboard moments loaded from CSA.
                    # It may be due to no valid data being available, but CSA makes a CDF with a single
                    # time value and data point that are both fillvals.
                    logging.info('ydata for variable %s is a scalar, converting to numpy array',var)
                    # We won't worry here about how many dimensions it's supposed to have.  We'll fix that below if needed.  For now, we just want to be
                    # sure it's not a scalar.
                    ydata = np.array(ydata)
                if "FILLVAL" in var_atts:
                    if new_cdflib:
                        thisvar_dtd = var_properties.Data_Type_Description
                    else:
                        thisvar_dtd = var_properties["Data_Type_Description"]

                    fillval = var_atts['FILLVAL']

                    if (thisvar_dtd == 'CDF_FLOAT' or
                            thisvar_dtd == 'CDF_REAL4' or
                            thisvar_dtd == 'CDF_DOUBLE' or
                            thisvar_dtd == 'CDF_REAL8'):

                        is_fill_cond = ydata == fillval
                        if is_fill_cond.all():
                            logging.warning("Floating point data values for variable %s are all fillval (%e)",var, fillval)
                            ydata[is_fill_cond] = np.nan
                        elif is_fill_cond.any():
                            ydata[is_fill_cond] = np.nan
                        else:
                            # No fillvals, nothing to do
                            pass
                    elif thisvar_dtd[:7] == 'CDF_INT':
                        # NaN is only valid for floating point data
                        # but we still need to handle FILLVAL's for
                        # integer data, so we'll just set those to 0
                        is_fill_cond = ydata == fillval
                        if is_fill_cond.all():
                            logging.warning("Integer data values for variable %s are all fillval (%d).",var, fillval)
                            ydata[is_fill_cond] = 0
                        elif is_fill_cond.any():
                            ydata[is_fill_cond] = 0
                        else:
                            # No fillvals, nothing to do
                            pass

                # Check dimensions of ydata to see if a leading time dimension has been lost
                # This seems to happen with some Cluster CDFs, at least the ones served by CSA,
                # if a variable only has a single timestamp. For example, the CP_CIS-HIA_ONBOARD_MOMENTS datatype, as seen
                # in test_load_csa_mom_data.

                num_times = len(xdata)
                ydims = ydata.shape
                y_ndims = len(ydata.shape)
                if num_times == 1:
                    if y_ndims == 0:
                        logging.warning("Restoring missing time dimension for scalar-valued variable %s", var)
                        ydata = ydata.reshape(1)
                        ydims = ydata.shape
                        y_ndims = len(ydata.shape)
                    elif ydims[0] != 1:
                        logging.warning("Restoring missing time dimension for array-valued variable %s", var)
                        ydata = ydata.reshape(1,*ydims)
                        ydims = ydata.shape
                        y_ndims = len(ydata.shape)
                elif nrv_has_times and (num_times > 2) and (ydims[0] != num_times):
                    # This case is primarily to catch some MMS FEEPS support variables
                    # that are marked NRV, but have a DEPEND_0.  Here, we ignore the times,
                    # make the tplot variable from just the Y data, and skip the rest of
                    # the metadata processing for this variable.
                    logging.warning("Ignoring times for probably non-record-varying variable %s", var_name)
                    output_table[var_name] = {'y': ydata}
                    continue

                tplot_data = {'x': xdata, 'y': ydata}

                # We want to know if this is a spectrogram or not.  If not, don't make
                # "v", "v1", "v2" entries. Technically, a vector quantity should have a DEPEND_1
                # with numeric values.  Most of the time, they are provided as strings, or simply
                # omitted.  But if we made a "v" variable for it, it would break a lot of code
                # that only expects to get (times, data) back from a get_data call on a non-spectral variable.

                is_spectrogram = False
                if 'DISPLAY_TYPE' in var_atts:
                    disp_type = var_atts['DISPLAY_TYPE'].lower()
                    if "spect" in disp_type:
                        is_spectrogram = True

                # Data may depend on other data in the CDF.
                depend_1 = None
                depend_2 = None
                depend_3 = None
                if "DEPEND_1" in var_atts:
                    if y_ndims < 2:
                        logging.warning("Variable %s has only %d dimension (including time), but has a DEPEND_1 attribute. Removing attribute.", var, y_ndims)
                        depend_1 = None
                    elif var_atts["DEPEND_1"] in master_cdf_variables:
                        try:
                            # Check for correct shape, matching time and data dimensions
                            dep_name = var_atts["DEPEND_1"]
                            depend_1 = np.array(master_cdf_file.varget(dep_name))
                            # String-valued DEPEND_1 handling
                            # This is not strictly ISTP compliant, but it's extremely common. For
                            # example, vector-valued data will often have DEPEND_1 values that are more like
                            # labels, i.e. ['x', 'y', 'z']
                            # Previous versions of cdf_to_tplot simply ignored string-valued DEPEND_N.
                            # But some missions (e.g. ERG) really do want string values as 'v1' or 'v2' tags.
                            #
                            # The situation is further complicated by the fact that cdflib seems to introduce
                            # extra leading or trailing dimensions on string-valued variables.
                            #
                            # As of cdflib version 1.3.3, it seems the issue has been corrected.  I will leave
                            # the test and warning enabled, in case of a regression later.  JWL 2025-04-04
                            #
                            if depend_1 is not None and depend_1.dtype.type is np.str_:
                                # Get the original array dimensions from the variable properties
                                dp_props = master_cdf_file.varinq(dep_name)
                                if new_cdflib:
                                    orig_dimensions = dp_props.Dim_Sizes
                                else:
                                    orig_dimensions = dp_props['Dim_Sizes']
                                reshape_dim = tuple(orig_dimensions)
                                if depend_1.shape != reshape_dim:
                                    logging.warning('Variable %s has shape %s. Its DEPEND_1 attribute %s is string-valued, and cdflib returned dimensions %s which do not match original dimensions %s.',var, ydata.shape, dep_name, depend_1.shape, reshape_dim)
                                    if len(depend_1.shape) == 1 and depend_1.shape[0] == ydata.shape[1]:
                                        logging.warning('Returned dimensions are a better match than original, no reshaping required')
                                    elif len(reshape_dim) == 1 and reshape_dim[0] == ydata.shape[1]:
                                        logging.warning('Original dimensions are a better match to data dimensions, reshaping.  Updating to a more recent version of cdflib may get rid of this warning.')
                                        depend_1 = np.reshape(depend_1, reshape_dim)
                                    else:
                                        logging.warning('Neither original nor returned dimensions match data shape, ignoring this DEPEND_1.')
                                        depend_1 = None
                                pass
                            dep_dims = depend_1.shape
                            dep_ndims = len(dep_dims)
                            if dep_ndims == 0:
                                logging.warning("Variable %s DEPEND_1 attribute %s is zero-dimensional, Removing attribute.", var, dep_name)
                                depend_1 = None
                            elif dep_ndims == 1:
                                # Not time varying
                                if dep_dims[0] != ydims[1]:
                                    logging.warning("Variable %s DEPEND_1 attribute %s has length %d, but corresponding data dimension has length %d. Removing attribute.",var,dep_name,dep_dims[0],ydims[1])
                                    depend_1 = None
                            elif dep_ndims == 2:
                                # time-varying or otherwise multidimensional
                                if dep_dims[0] != num_times:
                                    logging.warning("Variable %s is 2-dimensional, but first dimension of DEPEND_1 attribute %s has size %d versus num_times %d. Attribute will be kept (for now).",var,dep_name,dep_dims[0], num_times)
                                    if dep_dims[0] == 1 and dep_dims[1] == ydims[1]:
                                        # RBSP EMFISIS HFR_Spectra has this
                                        logging.warning("Variable %s DEPEND_1 attribute %s has dimensions 1 x y_dims[1]; reshaping to 1-D array.", var, dep_name)
                                        depend_1 = np.reshape(depend_1, (ydims[1],))
                                    # Or, it could be ERG HEP omniflux data with an extra (length-2) dimension as upper/lower bounds.
                                    # So for now, we'll allow it.
                                    pass
                                if dep_dims[1] != ydims[1]:
                                    # ERG XEP seems to make a 9x2 rather than a 2x9 array
                                    logging.warning("Variable %s is 2-dimensional, but second dimension of DEPEND_1 attribute %s has data length %d, but corresponding data dimension has length %d. Attribute will be kept (for now).",var,dep_name,dep_dims[1],ydims[1])
                                    #depend_1 = None
                                    pass
                            else:
                                # Too many dimensions
                                # ERG LEPE has time dependent DEPEND_1 with an extra dimension for upper/lower limits, so
                                # we need to allow this for now, or at least add a flag to skip this check.
                                logging.warning("Variable %s DEPEND_1 attribute %s has too many dimensions (%d). Keeping extra dimensions (for now).",var,dep_name,dep_ndims)
                                #depend_1 = None
                                pass

                        except ValueError:
                            logging.warning('Unable to get DEPEND_1 variable %s while processing %s',
                                            var_atts["DEPEND_1"], var)
                            pass
                if "DEPEND_2" in var_atts:
                    if y_ndims < 3:
                        logging.warning("Variable %s has only %d dimensions (including time), but has a DEPEND_2 attribute. Removing attribute.", var, y_ndims)
                        depend_2 = None
                    elif var_atts["DEPEND_2"] in master_cdf_variables:

                        try:
                            # Check for correct shape, matching time and data dimensions
                            dep_name = var_atts["DEPEND_2"]
                            depend_2 = np.array(master_cdf_file.varget(dep_name))
                            # String-valued DEPEND_N handling
                            # This is not strictly ISTP compliant, but it's extremeley common. For
                            # example, vector-valued data will often have DEPEND_1 values that are more like
                            # labels, i.e. ['x', 'y', 'z']
                            # Previous versions of cdf_to_tplot simply ignored string-valued DEPEND_N.
                            # But some missions (e.g. ERG) really do want string values as 'v1' or 'v2' tags.
                            #
                            # The situation is further complicated by the fact that cdflib seems to introduce
                            # extra leading or trailing dimensions on string-valued variables.
                            if depend_2 is not None and depend_2.dtype.type is np.str_:
                                # Get the original array dimensions from the variable properties
                                dp_props = master_cdf_file.varinq(dep_name)
                                if new_cdflib:
                                    orig_dimensions = dp_props.Dim_Sizes
                                else:
                                    orig_dimensions = dp_props['Dim_Sizes']
                                reshape_dim = tuple(orig_dimensions)
                                depend_2 = np.reshape(depend_2, reshape_dim)
                                #depend_2 = None
                                pass

                            dep_dims = depend_2.shape
                            dep_ndims = len(dep_dims)
                            if dep_ndims == 0:
                                logging.warning("Variable %s DEPEND_2 attribute %s is zero-dimensional. Removing attribute.", var,
                                                dep_name)
                                depend_2 = None
                            elif dep_ndims == 1:
                                # Not time varying
                                if dep_dims[0] != ydims[2]:
                                    logging.warning(
                                        "Variable %s DEPEND_2 attribute %s has length %d, but corresponding data dimension has length %d. Removing attribute.",
                                        var, dep_name, dep_dims[0], ydims[2])
                                    depend_2 = None
                            elif dep_ndims == 2:
                                # time-varying or otherwise multidimensional
                                if dep_dims[0] != num_times:
                                    logging.warning(
                                        "Variable %s multidimensional DEPEND_2 attribute %s has %d elements in first dimension, but data has %d times. Removing attribute.",
                                        var, dep_name, dep_dims[0], num_times)
                                    depend_2 = None
                                if dep_dims[1] != ydims[2]:
                                    logging.warning(
                                        "Variable %s multidimensional DEPEND_2 attribute %s has %d elements in second dimension, but corresponding data dimension has length %d. Removing attribute.",
                                        var, dep_name, dep_dims[1], ydims[2])
                                    depend_2 = None
                            else:
                                # Too many dimensions
                                logging.warning(
                                    "Variable %s DEPEND_2 attribute %s has too many dimensions (%d). Removing attribute.",
                                    var, dep_name, dep_ndims)
                                depend_2 = None
                        except ValueError:
                            logging.warning('Unable to get DEPEND_2 variable %s while processing %s',
                                            var_atts["DEPEND_2"], var)
                if "DEPEND_3" in var_atts:
                    if y_ndims < 4:
                        # TWINS imager data has this
                        logging.warning("Variable %s has only %d dimensions (including time), but has a DEPEND_3 attribute. Removing attribute.", var, y_ndims)
                        depend_3 = None
                    elif var_atts["DEPEND_3"] in master_cdf_variables:
                        try:
                            # Check for correct shape, matching time and data dimensions
                            dep_name = var_atts["DEPEND_3"]
                            depend_3 = np.array(master_cdf_file.varget(dep_name))
                            # String-valued DEPEND_N handling
                            # This is not strictly ISTP compliant, but it's extremeley common. For
                            # example, vector-valued data will often have DEPEND_1 values that are more like
                            # labels, i.e. ['x', 'y', 'z']
                            # Previous versions of cdf_to_tplot simply ignored string-valued DEPEND_N.
                            # But some missions (e.g. ERG) really do want string values as 'v1' or 'v2' tags.
                            #
                            # The situation is further complicated by the fact that cdflib seems to introduce
                            # extra leading or trailing dimensions on string-valued variables.
                            if depend_3 is not None and depend_3.dtype.type is np.str_:
                                # Get the original array dimensions from the variable properties
                                dp_props = master_cdf_file.varinq(dep_name)
                                if new_cdflib:
                                    orig_dimensions = dp_props.Dim_Sizes
                                else:
                                    orig_dimensions = dp_props['Dim_Sizes']
                                reshape_dim = tuple(orig_dimensions)
                                depend_3 = np.reshape(depend_3, reshape_dim)
                                #depend_3 = None
                                pass

                            dep_dims = depend_3.shape
                            dep_ndims = len(dep_dims)
                            if dep_ndims == 0:
                                logging.warning("Variable %s DEPEND_3 attribute %s is zero-dimensional. Removing attribute.", var,
                                                dep_name)
                                depend_3 = None
                            elif dep_ndims == 1:
                                # Not time varying
                                if dep_dims[0] != ydims[3]:
                                    logging.warning(
                                        "Variable %s DEPEND_3 attribute %s has length %d, but corresponding data dimension has length %d. Removing attribute.",
                                        var, dep_name, dep_dims[0], ydims[3])
                                    depend_3 = None
                            elif dep_ndims == 2:
                                # time-varying, or otherwise multidimensional
                                if dep_dims[0] != num_times:
                                    logging.warning(
                                        "Variable %s multidimensional DEPEND_3 attribute %s has %d elements in first dimension, but data has %d times. Removing attribute.",
                                        var, dep_name, dep_dims[0], num_times)
                                    depend_3 = None
                                if dep_dims[1] != ydims[3]:
                                    logging.warning(
                                        "Variable %s multidimensional DEPEND_3 attribute %s has %d elements in second dimension, but corresponding data dimension has length %d. Removing attribute.",
                                        var, dep_name, dep_dims[1], ydims[3])
                                    depend_3 = None
                            else:
                                # Too many dimensions
                                logging.warning(
                                    "Variable %s DEPEND_3 attribute %s has too many dimensions (%d). Removing attribute.",
                                    var, dep_name, dep_ndims)
                                depend_3 = None
                        except ValueError:
                            logging.warning('Unable to get DEPEND_3 variable %s while processing %s',
                                            var_atts["DEPEND_3"], var)

                nontime_varying_depends = []

                # Fill in any missing depend_n values (skipping this for now)
                ndims = len(ydata.shape)
                if ndims >= 2 and depend_1 is None:
                    # This is so common, we won't bother logging it
                    # depend_1 = np.arange(ydata.shape[1])
                    pass
                if ndims >= 3 and depend_2 is None:
                    #logging.warning("Variable %s has %d dimensions, but no DEPEND_2, adding index range for dimension 2", var_name, ndims )
                    #depend_2 = np.arange(ydata.shape[2])
                    pass
                if ndims >= 4 and depend_3 is None:
                    #logging.warning("Variable %s has %d dimensions, but no DEPEND_3, adding index range for dimension 3", var_name, ndims )
                    #depend_3 = np.arange(ydata.shape[3])
                    pass

                if depend_1 is not None and depend_2 is not None and depend_3 is not None:
                    tplot_data['v1'] = depend_1
                    tplot_data['v2'] = depend_2
                    tplot_data['v3'] = depend_3

                    if len(depend_1.shape) == 1:
                        nontime_varying_depends.append('v1')
                    if len(depend_2.shape) == 1:
                        nontime_varying_depends.append('v2')
                    if len(depend_3.shape) == 1:
                        nontime_varying_depends.append('v3')

                elif depend_1 is not None and depend_2 is not None:
                    tplot_data['v1'] = depend_1
                    tplot_data['v2'] = depend_2
                    if len(depend_1.shape) == 1:
                        nontime_varying_depends.append('v1')
                    if len(depend_2.shape) == 1:
                        nontime_varying_depends.append('v2')
                elif depend_1 is not None:
                    tplot_data['v'] = depend_1
                    if len(depend_1.shape) == 1:
                        nontime_varying_depends.append('v')
                elif depend_2 is not None:
                    tplot_data['v2'] = depend_2
                    if len(depend_2.shape) == 1:
                        nontime_varying_depends.append('v')

                metadata[var_name] = {'display_type': var_atts.get("DISPLAY_TYPE", "time_series"),
                                      'scale_type': var_atts.get("SCALE_TYP"),
                                      'y_spec_scale_type': None,
                                      'var_attrs': var_atts,
                                      'labels': None,
                                      'file_name': filename,
                                      'global_attrs': gatt}

                labl_ptr = var_atts.get('LABL_PTR_1')
                if labl_ptr is not None:
                    try:
                        labl_ptr_arr = master_cdf_file.varget(labl_ptr)
                        if labl_ptr_arr is not None:
                            metadata[var_name]['labels'] = labl_ptr_arr.flatten().tolist()
                    except:
                        pass

                units = filter_greater_than(var_atts.get('UNITS'))
                if units is None:
                    unit_ptr = var_atts.get('UNIT_PTR')
                    if unit_ptr is not None:
                        try:
                            unit_ptr_array = master_cdf_file.varget(unit_ptr)
                            if unit_ptr_array is not None:
                                units = filter_greater_than(unit_ptr_array.flatten().tolist())
                        except:
                            pass
                if isinstance(units, (list,np.ndarray)):
                    # If units is a list or array, and are all the same, replace with the single value
                    # Otherwise, stringify the whole array
                    firstunit=units[0].lower()
                    allsame=True
                    for u in units:
                        if u.lower() != firstunit:
                            allsame=False
                    if allsame:
                        # Return units as a scalar
                        metadata[var_name]['units'] = units[0]
                    else:
                        # Go ahead and stringify the whole mess to force it to bw a scalar
                        # TODO: there must be a better way to handle this!
                        logging.warning(f'Variable {var_name} in file {cdf_file} has non-homogeneous unit values {units}')
                        metadata[var_name]['units'] = str(units)
                else:
                     metadata[var_name]['units'] = str(units)

                if metadata[var_name]['scale_type'] is None:
                    alt_scale_type = var_atts.get("SCALETYP", "linear")
                    if alt_scale_type is not None:
                        metadata[var_name]['scale_type'] = alt_scale_type

                # handle y-axis options for spectra
                if 'DEPEND_1' in var_atts:
                    if isinstance(var_atts['DEPEND_1'], str):
                        try:
                            depend_1_var_atts = master_cdf_file.varattsget(var_atts['DEPEND_1'])

                            scale_type = depend_1_var_atts.get('SCALETYP')
                            if scale_type is None:
                                scale_type = depend_1_var_atts.get('SCALE_TYP')

                            if scale_type is not None:
                                metadata[var_name]['y_spec_scale_type'] = scale_type

                            depend_1_units = depend_1_var_atts.get('UNITS')

                            if depend_1_units is not None:
                                metadata[var_name]['y_spec_units'] = depend_1_units
                                metadata[var_name]['DEPEND_1_UNITS'] = depend_1_units
                        except ValueError:
                            pass

                # options for multidimensional variables
                if 'DEPEND_2' in var_atts:
                    if isinstance(var_atts['DEPEND_2'], str):
                        try:
                            depend_2_var_atts = master_cdf_file.varattsget(var_atts['DEPEND_2'])
                            depend_2_units = depend_2_var_atts.get('UNITS')
                            if depend_2_units is not None:
                                metadata[var_name]['DEPEND_2_UNITS'] = depend_2_units
                        except ValueError:
                            # some variables aren't actually available
                            pass
                if 'DEPEND_3' in var_atts:
                    if isinstance(var_atts['DEPEND_3'], str):
                        try:
                            depend_3_var_atts = master_cdf_file.varattsget(var_atts['DEPEND_3'])
                            depend_3_units = depend_3_var_atts.get('UNITS')
                            if depend_3_units is not None:
                                metadata[var_name]['DEPEND_3_UNITS'] = depend_3_units
                        except ValueError:
                            # some variables aren't actually available
                            pass

                # Check if the variable already exists in the for loop output
                if var_name not in output_table:
                    output_table[var_name] = tplot_data
                else:
                    # If it does, loop though the existing variable's x,y,v,v2,v3,etc
                    var_data = output_table[var_name]
                    for output_var in var_data:
                        if output_var not in nontime_varying_depends:
                            if np.asarray(tplot_data[output_var]).ndim == 0 and np.equal(tplot_data[output_var], None):
                                # If there is nothing in the new variable, then pass
                                pass
                            elif np.asarray(var_data[output_var]).ndim == 0 and np.equal(var_data[output_var], None):
                                # If there is nothing in the old variable, then replace
                                var_data[output_var] = tplot_data[output_var]
                            else:  # If they both have something, then concatenate
                                var_data[output_var] = np.concatenate((var_data[output_var], tplot_data[output_var]))

    if notplot:
        return output_table

    for var_name in output_table.keys():
        to_merge = False
        if var_name in pyspedas.tplot_tools.data_quants.keys() and merge:
            prev_data_quant = pyspedas.tplot_tools.data_quants[var_name]
            to_merge = True

        try:
            attr_dict = {}
            if metadata.get(var_name) is not None:
                attr_dict["CDF"] = {}
                attr_dict["CDF"]["VATT"] = metadata[var_name]['var_attrs']
                attr_dict["CDF"]["GATT"] = metadata[var_name]['global_attrs']
                attr_dict["CDF"]["FILENAME"] = metadata[var_name]['file_name']
                attr_dict["CDF"]["LABELS"] = metadata[var_name]['labels']

                # populate data_att; used by PySPEDAS as a common interface to
                # data attributes such as units, coordinate system, etc
                attr_dict["data_att"] = {"coord_sys": "",
                                         #"units": metadata[var_name]['var_attrs'].get('UNITS'),
                                         "units": metadata[var_name]['units'],
                                         "depend_1_units": metadata[var_name].get('DEPEND_1_UNITS'),
                                         "depend_2_units": metadata[var_name].get('DEPEND_2_UNITS'),
                                         "depend_3_units": metadata[var_name].get('DEPEND_3_UNITS')}

                # populate depend_1_units in data_att, if it's not set
                if attr_dict['data_att']['depend_1_units'] is None and metadata[var_name]['var_attrs'].get('UNITS') is not None:
                    attr_dict['data_att']['depend_1_units'] = metadata[var_name]['var_attrs'].get('UNITS')

                # extract the coordinate system, if available
                vatt_keys = list(attr_dict["CDF"]["VATT"].keys())
                vatt_lower = [k.lower() for k in vatt_keys]
                if 'coordinate_system' in vatt_lower:
                    attr_dict['data_att']['coord_sys'] = filter_greater_than(
                        attr_dict["CDF"]["VATT"][vatt_keys[vatt_lower.index('coordinate_system')]])

                if 'labels' in vatt_lower:
                    if attr_dict["CDF"]["VATT"].get('labels') is not None:
                        if isinstance(attr_dict["CDF"]["VATT"]['labels'], str):
                            # check for line separators
                            # this fixes the legend for RBSP L3 EFW data
                            if '\\n' in attr_dict["CDF"]["VATT"]['labels']:
                                attr_dict["CDF"]["VATT"]['labels'] = attr_dict["CDF"]["VATT"]['labels'].split('\\n')
                            if '\\N' in attr_dict["CDF"]["VATT"]['labels']:
                                attr_dict["CDF"]["VATT"]['labels'] = attr_dict["CDF"]["VATT"]['labels'].split('\\N')
            store_data(var_name, data=output_table[var_name], attr_dict=attr_dict)
        except (TypeError, ValueError) as err:
            logging.warning("Exception of type %s raised during store_data call for variable %s", str(type(err)), var_name)
            logging.warning("Exception message: %s",str(err))
            continue

        if var_name not in stored_variables:
            stored_variables.append(var_name)

        if metadata.get(var_name) is not None:
            if metadata[var_name]['display_type'].lower() == "spectrogram":
                options(var_name, 'spec', 1)
            if metadata[var_name]['scale_type'] == 'log':
                if metadata[var_name]['display_type'].lower() == "spectrogram":
                    options(var_name, 'zlog', 1)
                else:
                    options(var_name, 'ylog', 1)
            if metadata[var_name].get('y_spec_scale_type') is not None:
                if metadata[var_name]['y_spec_scale_type'] == 'log':
                    options(var_name, 'ylog', 1)
            if metadata[var_name].get('y_spec_units') is not None:
                options(var_name, 'ysubtitle', '[' + metadata[var_name].get('y_spec_units') + ']')
            if metadata[var_name].get('var_attrs') is not None:
                if metadata[var_name]['var_attrs'].get('LABLAXIS') is not None:
                    options(var_name, 'ytitle', metadata[var_name]['var_attrs']['LABLAXIS'])
                if metadata[var_name]['var_attrs'].get('UNITS') is not None:
                    unitsstr = filter_greater_than(metadata[var_name]['var_attrs']['UNITS'])
                    if metadata[var_name]['display_type'].lower() == 'spectrogram':
                        options(var_name, 'ztitle', f'[{unitsstr}]')
                    else:
                        options(var_name, 'ysubtitle', f'[{unitsstr}]')

            # Gather up all options in the variable attribute section, toss them into options and see what sticks
            # JWL 2025-03-06: We want options() to warn about unrecognized option names.  If used here, it will
            # spam the logs with warnings.  There is also the possibility that someone could use an attribute name
            # for some other purpose, that collides with the PySPEDAS interpretation.
            #options(var_name, opt_dict=metadata[var_name]['var_attrs'])

        if to_merge is True:
            cur_data_quant = pyspedas.tplot_tools.data_quants[var_name]
            if isinstance(pyspedas.tplot_tools.data_quants[var_name], dict):  # non-record varying variable, shouldn't be merged
                continue
            plot_options = copy.deepcopy(pyspedas.tplot_tools.data_quants[var_name].attrs)
            pyspedas.tplot_tools.data_quants[var_name] = xr.concat([prev_data_quant, cur_data_quant], dim='time').sortby('time')
            pyspedas.tplot_tools.data_quants[var_name].attrs = plot_options

    if notplot:
        return output_table

    if plot:
        tplot(stored_variables)

    return stored_variables



def filter_greater_than_single(attr):
    """
    Strip a trailing comment from a single variable attribute value.

    For string input, only the text before the first '>' is kept
    (e.g., coordinate systems, units), and whitespace at the end of that
    text is removed.  Anything that is not a string is handed back untouched.
    """
    if isinstance(attr, str):
        # Everything from the first '>' onward is discarded
        leading_text, _, _ = attr.partition('>')
        return leading_text.rstrip()
    return attr

def filter_greater_than(attr):
    """
    Strip CDF comments from attribute values (single value or array/list of values)
    Only the text to the left of the first '>' is kept, with trailing whitespace removed.

    Args:
        attr: A single attribute value, or an iterable (list, tuple, array, ...)
            of attribute values.

    Returns:
        str: The filtered text, when attr is a string.
        list: One filtered entry per element, when attr is any other iterable.
            Elements that are not strings are passed through unchanged.
        Otherwise, attr itself is returned unchanged (e.g. None, numbers, bytes).

    """
    # Strings and byte strings are iterable, but they represent a single value.
    # Iterating over bytes yields integers, so mapping over a byte string would
    # turn it into a list of character codes instead of leaving it intact.
    if isinstance(attr, (str, bytes, bytearray)) or not isinstance(attr, Iterable):
        return filter_greater_than_single(attr)
    return [filter_greater_than_single(item) for item in attr]