# Source code for pyspedas.projects.kyoto.load_dst

import logging
import re
from pyspedas.tplot_tools import time_double, store_data, options, time_clip as tclip
from pyspedas import download, dailynames
from .kyoto_config import CONFIG


def parse_dst_html(html_text, year=None, month=None):
    """
    Extract the hourly Dst samples from one monthly Kyoto Dst HTML page.

    The table is taken to be the text between the "Hourly Equatorial Dst Values"
    heading and the "<!-- vvvvv S yyyymm_part3.html vvvvv -->" marker. The first
    five lines of that text (starting with the heading line) are skipped; every
    remaining line is read as one day: the day of month followed by the hourly
    values.

    Parameters
    ----------
    html_text : str
        The HTML content to parse.
    year : str
        Year of the page (e.g. "2015"). It is concatenated verbatim into the
        time strings, so it must be a string whenever the page contains data;
        leaving the default None raises TypeError at the first valid sample.
    month : str
        Month of the page (e.g. "01"); same requirements as ``year``.

    Returns
    -------
    tuple of (list, list)
        ``(times, data)``. ``times`` holds the ``time_double`` value for the
        middle of each hour ("hh:30") and ``data`` the matching Dst values as
        floats. Missing samples (fill value 9999) are omitted, so both lists
        always have the same length. Both are empty if the table heading is
        not present in ``html_text``.

    Notes
    -----
    Values are printed in 4-character columns and a missing hour is written as
    9999, which fills its whole column. A missing hour therefore fuses with its
    neighbours into one long run of digits (e.g. '-159999' is -15 followed by
    one missing hour). Each number found on a line is consequently treated as
    "one value followed by zero or more fused columns", and the hour counter
    is advanced by the number of columns the token spans, so values that
    follow a gap keep their correct hour.
    """
    times = []
    data = []

    # Locate the table. A missing heading means there is nothing to parse.
    table_start = html_text.find("Hourly Equatorial Dst Values")
    if table_start < 0:
        return (times, data)
    html_data = html_text[table_start:]

    # Drop everything after the table. If the end marker is absent keep the
    # whole remainder (slicing with find()'s -1 would cut the last character).
    table_end = html_data.find("<!-- vvvvv S yyyymm_part3.html vvvvv -->")
    if table_end >= 0:
        html_data = html_data[:table_end]

    # Lines 0-4 are skipped as header; the rest are candidate day rows.
    data_strs = html_data.split("\n")[5:]

    field_width = 4      # the Kyoto table uses 4-character columns
    fill_value = "9999"  # marks an hour without data
    number_pattern = re.compile(r'[-+]?\d+')

    for day_str in data_strs:
        # The first number is the day of month, the rest are the Dst columns.
        tokens = number_pattern.findall(day_str)
        if len(tokens) < 2:
            continue  # blank line, markup, or a day number without values
        day = tokens[0]

        hour = 0
        for token in tokens[1:]:
            # Length of the leading value: the part of the token that does not
            # fit into whole columns, or one full column if it divides evenly
            # ('-23' -> 3, '-159999' -> 3, '-1599999' -> 4, '99999999' -> 4).
            lead = len(token) % field_width or field_width
            value_str = token[:lead]
            if value_str != fill_value:
                times.append(
                    time_double(
                        year + "-" + month + "-" + day + "/" + str(hour) + ":30"
                    )
                )
                data.append(float(value_str))
            # Step over the leading value plus every column fused behind it.
            hour += 1 + (len(token) - lead) // field_width

    return (times, data)


# [docs]
def dst(
    trange=None,
    datatypes=("final", "provisional", "realtime"),
    time_clip=True,
    remote_data_dir="http://wdc.kugi.kyoto-u.ac.jp/",
    prefix="",
    suffix="",
    no_download=False,
    local_data_dir="",
    download_only=False,
    force_download=False,
):
    """
    Loads Dst index data from the Kyoto servers.

    Parameters
    ----------
    trange : list of str, required
        Time range of interest with the format ['YYYY-MM-DD','YYYY-MM-DD'] or
        to specify more or less than a day ['YYYY-MM-DD/hh:mm:ss','YYYY-MM-DD/hh:mm:ss'].
    datatypes : str or sequence of str, optional
        Dst data types to try, in order of preference. For every month the
        first type that yields data is used.
        Defaults to ("final", "provisional", "realtime"); None or an empty
        sequence selects the default.
    time_clip : bool, optional
        Time clip the variables to exactly the range specified in the trange keyword.
        Defaults to True.
    remote_data_dir : str, optional
        The remote directory from where to load the Dst index data.
        Defaults to "http://wdc.kugi.kyoto-u.ac.jp/". An empty string selects
        CONFIG["remote_data_dir_dst"].
    prefix : str, optional
        The tplot variable names will be given this prefix.
        By default, no prefix is added.
    suffix : str, optional
        The tplot variable names will be given this suffix.
        By default, no suffix is added.
    no_download : bool, optional
        Passed through to ``download``.
        Defaults to False.
    local_data_dir : str, optional
        Directory below which the downloaded pages are stored.
        An empty string (the default) selects CONFIG["local_data_dir"].
    download_only : bool, optional
        If True, the pages of every requested data type are downloaded but not
        parsed, no tplot variable is created and an empty list is returned.
        Defaults to False.
    force_download: bool
        Download file even if local version is more recent than server version
        Default: False

    Returns
    -------
    list of str
        List of tplot variables created. Empty if the time range is invalid,
        if no data could be loaded, or if ``download_only`` is set.

    Notes
    -----
    There are three types of Dst data available: final, provisional, and realtime.
    Usually, only one type is available for a particular month.
    For each month, this function tries to download final data, if this is not
    available then it downloads provisional data, and if this is not available
    then it downloads realtime data. The data types actually used are listed in
    the "ytitle" option of the tplot variable.

    Examples
    --------
    >>> from pyspedas.projects.kyoto import dst
    >>> dst_data = dst(trange=['2015-01-01', '2015-01-02'])
    >>> print(dst_data)
    ['kyoto_dst']
    """
    tplot_vars = []  # list of tplot variables created

    if trange is None or len(trange) != 2:
        logging.error("dst: Keyword trange with two datetimes is required to download data.")
        return tplot_vars
    trange_dbl = time_double(trange)
    earliest_data = time_double('1957-01-01')
    if trange_dbl[0] >= trange_dbl[1]:
        logging.error(f"dst: Invalid time range. End time {trange[1]} must be greater than start time {trange[0]}.")
        return tplot_vars
    if trange_dbl[1] < earliest_data:
        logging.error(f"dst: Invalid time range: specified end date {trange[1]} is earlier than 1957-01-01")
        return tplot_vars

    # Honour the caller's datatypes; a bare string is treated as a single type.
    if not datatypes:
        datatypes = ("final", "provisional", "realtime")
    elif isinstance(datatypes, str):
        datatypes = (datatypes,)

    if local_data_dir == "":
        local_data_dir = CONFIG["local_data_dir"]
    # Paths and URLs below are built by plain concatenation, so both bases
    # need a trailing slash (guarded so an empty string does not raise).
    if local_data_dir and not local_data_dir.endswith("/"):
        local_data_dir += "/"

    if remote_data_dir == "":
        remote_data_dir = CONFIG["remote_data_dir_dst"]
    if remote_data_dir and not remote_data_dir.endswith("/"):
        remote_data_dir += "/"

    try:
        file_names = dailynames(file_format="%Y%m/index.html", trange=trange)
    except Exception as e:
        logging.error("Error occurred while getting file names: " + str(e))
        return tplot_vars

    # One page per month: drop duplicates and sort, so that the months (and
    # therefore the samples) are processed in chronological order.
    file_names = sorted(set(file_names))

    ack = """
            ******************************
            The DST data are provided by the World Data Center for Geomagnetism, Kyoto, and
            are not for redistribution (http://wdc.kugi.kyoto-u.ac.jp/). Furthermore, we thank
            the geomagnetic observatories (Kakioka [JMA], Honolulu and San Juan [USGS], Hermanus
            [RSA], Alibag [IIG]), NiCT, INTERMAGNET, and many others for their cooperation to
            make the Dst index available.
            ******************************
        """

    times = []
    data = []
    used_datatypes = []  # data types that actually contributed samples

    for filename in file_names:
        yyyymm = ""
        if len(filename) > 6:
            yyyymm = filename[:6]

        # Try the data types in order of preference for this month and stop
        # at the first one that provides samples.
        for datatype in datatypes:
            url = remote_data_dir + "dst_" + datatype + "/" + filename
            local_path = local_data_dir + "dst_" + datatype + "/" + yyyymm + "/"
            fname = download(
                url, local_path=local_path, no_download=no_download, text_only=True, force_download=force_download,
            )

            if download_only:
                continue  # fetch every requested type, parse nothing

            if fname is None or len(fname) < 1:
                logging.error("Error occurred while downloading: " + url)
                continue  # try the next data type
            try:
                with open(fname[0], "r") as file:
                    html_text = file.read()
                file_times, file_data = parse_dst_html(
                    html_text, year=filename[:4], month=filename[4:6]
                )
            except Exception as e:
                logging.error(
                    "Error occurred while parsing " + filename + ": " + str(e)
                )
                continue  # try the next data type

            if file_times:
                times.extend(file_times)
                data.extend(file_data)
                if datatype not in used_datatypes:
                    used_datatypes.append(datatype)
                break  # this month is covered

    if download_only:
        # Nothing was parsed on purpose; this is not a "no data" error.
        return tplot_vars

    if len(times) == 0:
        logging.error("No data found.")
        return tplot_vars

    varname = prefix + "kyoto_dst" + suffix
    store_data(varname, data={"x": times, "y": data})
    options(varname, "ytitle", "Dst (" + ", ".join(used_datatypes) + ")")
    tplot_vars.append(varname)

    if time_clip:
        tclip(varname, trange[0], trange[1], overwrite=True)

    logging.info(ack)

    return tplot_vars