Source code for pyspedas.cdagui_tools.cdaweb

"""
Get information and download files from CDAWeb using cdasws.

For cdasws documentation, see:
    https://pypi.org/project/cdasws/
    https://cdaweb.gsfc.nasa.gov/WebServices/REST/py/cdasws/index.html

"""

import logging
import os
import re
from cdasws import CdasWs
from pyspedas.tplot_tools import cdf_to_tplot, netcdf_to_tplot, time_clip as tclip
from pyspedas.utilities.download import download
from pyspedas.cdagui_tools.config import CONFIG



[docs]
class CDAWeb:
    """Get information and download files from CDAWeb using cdasws."""

    def __init__(self):
        """Create the CdasWs client that the other methods use.

        The web service endpoint is read from ``CONFIG["cdas_endpoint"]``.
        """
        endpoint_url = CONFIG["cdas_endpoint"]
        self.cdas = CdasWs(endpoint=endpoint_url)


[docs]
    def get_observatories(self):
        """Return a list of strings CDAWeb uses to designate missions or mission groups

        Examples
        --------

        >>> from pyspedas import CDAWeb
        >>> cdaweb_obj = CDAWeb()
        >>> obs_names = cdaweb_obj.get_observatories()
        """
        observatories = self.cdas.get_observatory_groups()
        onames = []
        for mission in observatories:
            mission_name = mission["Name"].strip()
            if len(mission_name) > 1 and mission_name != "(null)":
                onames.append(mission_name)
        return onames



[docs]
    def get_instruments(self):
        """Return a list of strings CDAWeb uses to designate instrument or dataset types.

        Examples
        --------

        >>> from pyspedas import CDAWeb
        >>> cdaweb_obj = CDAWeb()
        >>> obs_names = cdaweb_obj.get_instruments()
        """
        instruments = self.cdas.get_instrument_types()
        inames = []
        for instrument in instruments:
            instr_name = instrument["Name"].strip()
            if len(instr_name) > 1 and instr_name != "(null)":
                inames.append(instr_name)
        return inames


    def clean_time_str(self, t):
        """Strip the time-of-day part from a date-time string.

        Every span that starts at a ``T`` and runs (greedily) to a later ``Z``
        with at least one character in between is removed, so
        ``"2007-02-26T00:00:00.000Z"`` becomes ``"2007-02-26"``. Strings
        without such a span are returned unchanged.

        Parameters
        ----------
        t: str
            Date-time string to clean.

        Returns
        -------
        str
            The input with the matched time part(s) removed.
        """
        time_part = re.compile("T.+Z")
        return time_part.sub("", t)


[docs]
    def get_datasets(self, mission_list, instrument_list):
        """Return a list of datasets recognized by CDAWeb, given lists of missions and instruments.

        Parameters
        ----------
        mission_list: list of str
            List of mission names, as obtained from get_observatories()
        instrument_list: list of str
            List of instrument names, as obtained from get_instruments()

        Returns
        -------
        list of str
            A list of available datasets for the given missions and instruments.

        Examples
        --------

        >>> from pyspedas import CDAWeb
        >>> cdaweb_obj = CDAWeb()
        >>> dataset_list = cdaweb_obj.get_datasets(['ARTEMIS'],['Electric Fields (space)'])

        """
        thisdict = {"observatoryGroup": mission_list, "instrumentType": instrument_list}
        datasets = self.cdas.get_datasets(**thisdict)
        dnames = []
        for dataset in datasets:
            data_item = dataset["Id"].strip()
            if len(data_item) > 0 and data_item != "(null)":
                tinterval = dataset["TimeInterval"]
                t1 = tinterval["Start"].strip()
                t2 = tinterval["End"].strip()
                t1 = self.clean_time_str(t1)
                t2 = self.clean_time_str(t2)
                data_item += " (" + t1 + " to " + t2 + ")"
            dnames.append(data_item)
        return dnames



[docs]
    def get_filenames(self, dataset_list, t0, t1):
        """Return a list of urls for a dataset between dates t0 and t1.

        Example: get_files(['THB_L2_FIT (2007-02-26 to 2020-01-17)'],
            '2010-01-01 00:00:00', '2010-01-10 00:00:00')

        Parameters
        ----------
        dataset_list: list of str
            A list of dataset names, as obtained from get_datasets()
        t0: str
            Start time for data to be retrieved
        t1: str
            End time for data to be retrieved

        Returns
        -------
        list of str
            A list of URLs for the given dataset and time range

        Examples
        --------

        >>> from pyspedas import CDAWeb
        >>> cdaweb_obj = CDAWeb()
        >>> urllist = cdaweb_obj.get_filenames(['THB_L2_FIT (2007-02-26 to 2020-01-17)'], '2010-01-01 00:00:00', '2010-01-10 00:00:00')
        """
        remote_url = []

        # Set times to cdas format
        t0 = t0.strip().replace(" ", "T", 1)
        if len(t0) == 10:
            t0 += "T00:00:01Z"
        elif len(t0) > 10:
            t0 += "Z"
        t1 = t1.strip().replace(" ", "T", 1)
        if len(t1) == 10:
            t1 += "T23:23:59Z"
        elif len(t1) > 10:
            t1 += "Z"

        # For each dataset, find the url of files
        for d in dataset_list:
            d0 = d.split(" ")
            if len(d0) > 0:
                status, result = self.cdas.get_data_file(d0[0], [], t0, t1)
                if status == 200 and (result is not None):
                    r = result.get("FileDescription")
                    if r is not None:
                        for f in r:
                            remote_url.append(f.get("Name"))
        return remote_url



[docs]
    def cda_download(
        self,
        remote_files,
        local_dir=None,
        download_only=False,
        varformat=None,
        get_support_data=False,
        prefix="",
        suffix="",
        varnames=None,
        notplot=False,
        merge=False,
        trange=None,
        time_clip=False,
        force_download=False,
    ):
        """Download data files and (by default) load the data into tplot variables

        Files ending in ``.cdf`` are loaded with cdf_to_tplot and files ending
        in ``.nc`` with netcdf_to_tplot; any other file is downloaded but only
        reported with a warning.

        Parameters
        ----------
        remote_files : list of str
            List of remote file URLs, as obtained from function get_filenames().
            Empty entries are skipped.
        local_dir : str
            Local directory to save the data in.
            If None, ``CONFIG["local_data_dir"]`` is used.
        download_only : bool
            If True, download the data, but do not load it into tplot variables.
        varformat: str
            If set, specifies a pattern for which CDF variables to load.
            Passed to cdf_to_tplot only.
        get_support_data: bool
            If True, load CDF variables marked as 'support_data'.
            Passed to cdf_to_tplot only.
        prefix: str
            If set, prepend this string to the variable name when creating the tplot variables.
        suffix: str
            If set, append this string to the variable name when creating the tplot variables.
        varnames: list of str
            If set, specifies a list of variables to load from the data files.
            If None or [] or ['*'], load all variables.
            Passed to cdf_to_tplot only.
        notplot: bool
            If True, return data directly as tplot data structures, rather than a list of tplot names.
            Passed to cdf_to_tplot only.
        merge: bool
            If True, merge the data with existing tplot variables.
            If False (the default), overwrite existing tplot variables.
        trange: list of str
            Two-element time range used for clipping; only used when time_clip is True.
        time_clip: bool
            If True, clip the time range of the data to the values in trange.
        force_download: bool
            If True, download the data even if it already exists locally.

        Returns
        -------
        tuple
            A tuple containing the number of distinct local files downloaded,
            the number of variables loaded, and a list of the tplot variables
            loaded (duplicates removed, in load order).

        Examples
        --------
        >>> from pyspedas import CDAWeb
        >>> from pyspedas import tplot
        >>> cdaweb_obj = CDAWeb()
        >>> urllist = cdaweb_obj.get_filenames(['THB_L2_FIT (2007-02-26 to 2020-01-17)'], '2010-01-01 00:00:00', '2010-01-10 00:00:00')
        >>> result = cdaweb_obj.cda_download(urllist,local_dir="/tmp")
        >>> tplot('thb_fgs_gsm')
        """

        # Set the local and remote directories
        remotehttp = CONFIG["remote_data_dir"]
        if local_dir is None:
            local_dir = CONFIG["local_data_dir"]

        # Download the files
        all_files = []
        for remotef in remote_files:
            if not remotef:
                continue
            # Mirror the remote layout below local_dir by dropping the server prefix.
            relative_path = remotef.strip().replace(remotehttp, "", 1)
            # Plain concatenation (not os.path.join) so that a relative_path with a
            # leading separator cannot replace local_dir.
            localf = os.path.normpath(local_dir + os.path.sep + relative_path)
            localfiles = download(
                remote_file=remotef,
                local_file=localf,
                force_download=force_download,
            )
            if localfiles is None:
                continue
            for path in localfiles:
                if path is not None and len(path) > 0:
                    all_files.append(os.path.normpath(path))

        # De-duplicate before counting so the reported number matches the files
        # that are actually loaded below.
        all_files = sorted(set(all_files))
        no_of_files = len(all_files)
        no_of_variables = 0
        loaded_vars = []

        if no_of_files == 0 or download_only:
            return (no_of_files, no_of_variables, loaded_vars)

        # Separate cdf and netcdf files. All other files cannot be loaded into tplot.
        cdf_files = []
        netcdf_files = []
        for path in all_files:
            if path.endswith(".cdf"):
                cdf_files.append(path)
            elif path.endswith(".nc"):
                netcdf_files.append(path)
            else:
                logging.warning("File type not supported: %s", path)

        if len(cdf_files) > 0:
            logging.info("Downloaded %d CDF files.", len(cdf_files))
            try:
                cdf_vars = cdf_to_tplot(
                    cdf_files,
                    prefix=prefix,
                    suffix=suffix,
                    get_support_data=get_support_data,
                    varformat=varformat,
                    varnames=varnames,
                    notplot=notplot,
                    merge=merge,
                )
                if cdf_vars is not None:
                    loaded_vars.extend(cdf_vars)
            except ValueError as err:
                # A load failure is logged and the remaining work continues.
                msg = "cdf_to_tplot could not load " + str(cdf_files)
                msg += "\n\n"
                msg += "Error from pytplot: " + str(err)
                logging.error(msg)

        if len(netcdf_files) > 0:
            logging.info("Downloaded %d NetCDF files.", len(netcdf_files))
            try:
                netcdf_vars = netcdf_to_tplot(
                    netcdf_files,
                    prefix=prefix,
                    suffix=suffix,
                    merge=merge,
                )
                if netcdf_vars is not None:
                    loaded_vars.extend(netcdf_vars)
            except ValueError as err:
                msg = "netcdf_to_tplot could not load " + str(netcdf_files)
                msg += "\n\n"
                msg += "Error from pytplot: " + str(err)
                logging.error(msg)

        # Remove duplicate names while keeping a stable, reproducible order.
        loaded_vars = list(dict.fromkeys(loaded_vars))
        no_of_variables = len(loaded_vars)
        logging.info("Number of tplot variables loaded: %d", no_of_variables)

        if time_clip and trange is not None:
            if trange[0] >= trange[1]:
                logging.warning(
                    "trange values equal or out of order, no time clipping performed"
                )
            else:
                tclip(loaded_vars, trange[0], trange[1], suffix="", overwrite=True)
        elif time_clip:
            logging.warning("Warning: No trange specified for time_clip")

        return (no_of_files, no_of_variables, loaded_vars)