Source code for pyspedas.analysis.avg_data

"""
Creates a new tplot variable as the time average of original.

    Notes
    -----
    Similar to avg_data.pro in IDL SPEDAS.

"""
import logging
import numpy as np
from pyspedas.tplot_tools import store_data, get_data, tnames, time_double



[docs]
def avg_data(names, trange=None, res=None, width=None,
             newname=None, suffix=None, overwrite=False):
    """
    Get a new tplot variable with averaged data.

    Parameters
    ----------
    names: str/list of str
        List of pytplot names.
    trange: list of float, optional
        Start time, end time.
        If empty, the data start and end time will be used.
    res: float, optional
        Time resolution in seconds for averaging data.
        It can be less than 1 sec.
        Default is 60 sec.
    width: int, optional
        Number of values for the averaging window.
        If res is set, then width is ignored.
    newname: str/list of str, optional
        List of new names for tplot variables.
        If not given, then a suffix is applied.
    suffix: str, optional
        A suffix to apply.
        Default is '-avg'.
    overwrite: bool, optional
        Replace the existing tplot name.
        Default is False.

    Returns
    -------
    n_names: str/list of str
        List of new pytplot names.

    """

    old_names = tnames(names)

    if res is None and width is None:
        res = 60  # Default is 60 sec

    if names is None or len(old_names) < 1:
        logging.error('avg_data: No valid tplot names were provided.')
        return

    if suffix is None:
        # IDL SPEDAS default suffix is '_avg'
        suffix = '-avg'

    if overwrite:
        n_names = old_names
    elif newname is None:
        n_names = [s + suffix for s in old_names]
    else:
        n_names = newname

    if isinstance(n_names, str):
        n_names = [n_names]

    if len(n_names) != len(old_names):
        n_names = [s + suffix for s in old_names]

    for old_idx, old in enumerate(old_names):
        new = n_names[old_idx]

        # Get times and data
        d = get_data(old)
        metadata = get_data(old, metadata=True)

        time = d[0]
        time = np.array(time_double(time))
        time_len = len(time)

        data = np.array(d[1])
        dim = data.shape
        dim0 = dim[0]
        if dim0 != time_len:
            logging.error('avg_data: Data and time length mismatch.')
            continue
        if len(dim) < 2:
            dim1 = 1
        else:
            dim1 = dim[1]

        # Data may also contain v, v1, v2, v3
        process_energies = []
        retain_energies = []
        for i, lend in enumerate(d):
            if i > 1:
                if len(lend) == len(time):
                    process_energies.append(i)
                else:
                    # These will retained in the results as-is
                    retain_energies.append(i)

        process_v = {}
        for i in process_energies:
            process_v[d._fields[i]] = []

        # Find start and end times
        if trange is not None and len(trange) == 2 and trange[0] < trange[1]:
            time_start = time_double(trange[0])
            time_end = time_double(trange[1])
        elif res is not None:
            # Define start and end times similar to IDL
            time_start = np.floor(time[0] / res) * res
            time_end = np.floor(time[-1] / res + 1) * res
        else:
            time_start = time[0]
            time_end = time[-1]

        # Check for empty set
        count_in_range = len(time[(time >= time_start) & (time <= time_end)])
        if time_end <= time_start or count_in_range < 2:
            logging.error('avg_data: No time values in provided time range.')
            continue

        # Find time bins
        time_duration = time_end - time_start

        if res is not None:
            # Given the resolution, compute bins
            dt = res
            bin_count = int(time_duration / dt)
            ind = np.floor((time - time_start) / dt)
        else:
            # Given the width, compute bins
            bins = np.arange(count_in_range)
            ind = np.floor(bins / width)
            bin_count = int(count_in_range / width)
            dt = time_duration / bin_count

        if bin_count < 2:
            msg = 'avg_data: too few bins. Bins=' + str(bin_count) \
                + ', Data points=' + str(count_in_range)
            logging.error(msg)
            continue

        # Split time into bins
        mdt = (time_end - time_start) / dt
        if (mdt - int(mdt) >= 0.5):
            max_ind = np.ceil(mdt)
        else:
            max_ind = np.floor(mdt)
        w1 = np.asarray(ind < 0).nonzero()
        ind[w1] = -1
        w2 = np.asarray(ind >= max_ind).nonzero()
        ind[w2] = -1

        # Find new times
        mx = np.max(ind) + 1
        new_times = (np.arange(mx) + 0.5) * dt + time_start

        # Find new data
        new_data = []
        for i in range(int(max_ind)):
            if i < 0:
                continue

            idx0 = np.asarray(ind == i).nonzero()
            isempty = True if len(idx0) < 1 else False

            if dim1 < 2:
                nd0 = np.nan if isempty else np.nanmean(data[idx0])
            else:
                nd0 = []
                for j in range(dim1):
                    tmp = np.nan if isempty else np.nanmean(data[idx0, j])
                    nd0.append(tmp)

            new_data.append(nd0)

            for ii in process_energies:
                # The following processes v, v1, v2, v3
                dime1 = len(d[ii][0])
                if dime1 < 2:
                    nd1 = np.nan if isempty else np.nanmean(d[ii][idx0])
                else:
                    nd1 = []
                    for j in range(dime1):
                        tmp = np.nan if isempty else np.nanmean(d[ii][idx0, j])
                        nd1.append(tmp)
                process_v[d._fields[ii]].append(nd1)

        # Create the new tplot variable
        data_dict = {'x': new_times, 'y': new_data}
        for i in retain_energies:
            data_dict[d._fields[i]] = d[i]
        for i in process_energies:
            data_dict[d._fields[i]] = process_v[d._fields[i]]

        store_data(new, data=data_dict, attr_dict=metadata)

        logging.info('avg_data was applied to: %s', new)

    return n_names