Source code for pyspedas.projects.kyoto.load_ae

import logging
import re
from datetime import datetime, timedelta
from pyspedas.tplot_tools import time_clip as tclip
from pyspedas.tplot_tools import store_data, options
from pyspedas import dailynames, download
from .kyoto_config import CONFIG
from pyspedas import time_string,get_data, time_double, time_string, del_data, tnames
from pyspedas import tplot_copy
import numpy as np
import os

def ae_file_sort_keys(downloaded_files):
    keys = []
    for fn in downloaded_files:
        bn = os.path.basename(fn)  # like aeyymmdd
        yy = bn[2:4]
        mm = bn[4:6]
        dd = bn[6:8]
        if int(yy) >= 69:
            key = '19' + yy + mm + dd
        else:
            key = '20' + yy + mm + dd
        keys.append(key)
    return keys


def parse_ae_html(html_text):
    """
    Parses the HTML text and extracts date-times and values.

    Parameters
    ----------
    html_text : str
        The HTML text to be parsed.

    Returns
    -------
    tuple
        A tuple containing two lists - date_times and values.
        date_times : list
            A list of datetime objects representing the date and time.
        values : list
            A list of integers representing the AE values.
    """
    date_times = []
    values = []
    html_lines = html_text.split("\n")

    for line in html_lines:
        # AEALAOAU    100101L00AL PRVAE/E02
        match = re.match(
            r"A\w{7}\s+(\d{6})\w(\d{2})(AE|AL|AO|AU|AX)\s+\w{5,8}(?:/\w\d{2})?\s+(.*)",
            line)
        if match:
            date_str = match.group(1)
            hour_str = match.group(2)
            # Note: we are passing a 2-digit year to strptime!  Fortunately, the earliest date
            # for provisional AE data is 1996-01-01, and strptime treats any 2-digit year >= 69
            # as being in the 1900's, .so this is correct for our purposes.  JWL 2026-02-09
            date = datetime.strptime(date_str + hour_str, "%y%m%d%H")
            minute_values = match.group(4).split()
            for i, value in enumerate(minute_values):
                if i >= 60:
                    break  # the last values is the daily mean
                date_times.append(date + timedelta(minutes=i))
                values.append(int(value))
    return date_times, values


def load_ae_worker(
    trange=None,
    datatypes=["ae", "al", "ao", "au", "ax"],
    time_clip=True,
    remote_data_dir="",
    prefix="",
    suffix="",
    no_download=False,
    local_data_dir="",
    download_only=False,
    force_download=False,
    skip_realtime = False,
    skip_provisional = False
):
    """
    Downloads and loads either provisional or realtime index data (but not both) from the Kyoto World Data Center for Geomagnetism.

    Parameters
    ----------
    trange : list
        The time range of the data to download and load.
        Format: [start_time, end_time].
    datatypes : list, optional
        The types of AE index data to download and load.
        Default: ["ae", "al", "ao", "au", "ax"].
    time_clip : bool, optional
        Whether to clip the loaded data to the specified time range.
        Default: True.
    remote_data_dir : str, optional
        The remote directory where the data is located.
        Default: "https://wdc.kugi.kyoto-u.ac.jp/ae_provisional/".
    prefix : str, optional
        A prefix to add to the variable names of the loaded data.
        Default: "".
    suffix : str, optional
        A suffix to add to the variable names of the loaded data.
        Default: "".
    no_download : bool, optional
        If True, the data will not be downloaded, the local file will be used.
        Default: False.
    local_data_dir : str, optional
        The local directory where the data will be downloaded.
        Default: "".
    download_only : bool, optional
        If True, only download the data, do not load it.
        Default: False.
    force_download: bool
        Download file even if local version is more recent than server version
        Default: False
    skip_provisional: bool
        If Rrue, only attempt downloading and processing realtime data. Default: False
    skip_realtime: bool
        If True, only attempt downloading and processing provisional data. Default: False


    Returns
    -------
    list
        A list of tplot variable names created.

    """

    vars = []  # list of tplot variables created
    ack = """
            ******************************
            The provisional AE data are provided by the World Data Center for Geomagnetism, Kyoto,
            and are not for redistribution (https://wdc.kugi.kyoto-u.ac.jp/). Furthermore, we thank
            AE stations (Abisko [SGU, Sweden], Cape Chelyuskin [AARI, Russia], Tixi [IKFIA and
            AARI, Russia], Pebek [AARI, Russia], Barrow, College [USGS, USA], Yellowknife,
            Fort Churchill, Sanikiluaq (Poste-de-la-Baleine) [CGS, Canada], Narsarsuaq [DMI,
            Denmark], and Leirvogur [U. Iceland, Iceland]) as well as the RapidMAG team for
            their cooperations and efforts to operate these stations and to supply data for the provisional
            AE index to the WDC, Kyoto. (Pebek is a new station at geographic latitude of 70.09N
            and longitude of 170.93E, replacing the closed station Cape Wellen.)
            ******************************
        """

    if local_data_dir == "":
        local_data_dir = CONFIG["local_data_dir"]
    if local_data_dir[-1] != "/":
        local_data_dir += "/"

    if remote_data_dir == "":
        remote_data_dir = CONFIG["remote_data_dir_ae"]

    all_datatypes = ["ae", "al", "ao", "au", "ax"]
    for i in range(len(datatypes)):
        datatype = datatypes[i].lower()
        if datatype not in all_datatypes:
            logging.error("Invalid datatype: " + datatype)
            continue  # skip to the next datatype

        rt_ff = "%Y/%m/%d/" + datatype + "%y%m%d"
        prov_ff = "%Y%m/" + datatype + "%y%m%d.for.request"

        try:
            rt_file_names = dailynames(file_format=rt_ff, trange=trange)
            prov_file_names = dailynames(file_format=prov_ff, trange=trange)
        except Exception as e:
            logging.error("Error occurred while getting file names: " + str(e))
            continue  # skip to the next datatype

        # Keep unique files names only
        rt_file_names = list(set(rt_file_names))
        prov_file_names = list(set(prov_file_names))
        # The above operation does not necessarily preserve order!  We need to sort the filenames.
        rt_file_names = sorted(rt_file_names)
        prov_file_names = sorted(prov_file_names)

        times = []
        data = []
        rt_downloaded_files = []
        prov_downloaded_files = []
        # Download the html page for each url and extract times and data
        if not skip_provisional:
            for filename in prov_file_names:
                yyyymm = ""
                if len(filename) > 6:
                    yyyymm = filename[:6]
                url = remote_data_dir + "ae_provisional/" + filename
                # Use a local path similar to IDL SPEDAS
                local_path = local_data_dir + "ae_provisional/" + datatype + "/" + yyyymm + "/"
                fname = download(url, local_path=local_path, no_download=no_download, force_download=force_download)
                if fname is not None and len(fname) >= 1:
                    prov_downloaded_files.extend(fname)

        if not skip_realtime:
            for filename in rt_file_names:
                yyyy = filename[:4]
                mm = filename[5:7]
                dd = filename[8:10]
                url = remote_data_dir + "ae_realtime/data_dir/" + filename
                # Use a local path similar to IDL SPEDAS
                local_path = local_data_dir + "ae_realtime/data_dir/" + datatype + "/" + yyyy + "/" + mm + "/" + dd + "/"

                fname = download(url, local_path=local_path, no_download=no_download, force_download=force_download)
                if fname is not None and len(fname) >= 1:
                    rt_downloaded_files.extend(fname)

        if download_only:
            continue  # skip to the next data type

        if len(rt_downloaded_files) == 0 and len(prov_downloaded_files) == 0:
            logging.warning(f"No data found in requested time range for datatype {datatype}")
            continue

        # Merge prov and realtime file lists
        # We need to use YYYYMMDD as sort/duplicate keys.  The filenames only have two digit years
        # so we need to infer (guess) the century.  Fortunately, the realtime and provisional
        # file basenames are close enough that we can use the same routine to generate sort
        # keys for both.

        rt_keys = ae_file_sort_keys(rt_downloaded_files)
        prov_keys = ae_file_sort_keys(prov_downloaded_files)

        # Check for duplicates...if found, take provisional over realtime
        rt_thinned = []
        rt_keys_thinned = []
        for k,f in zip(rt_keys, rt_downloaded_files):
            if k in prov_keys:
                logging.warning(f"Both realtime and provional {datatype} data found for date {k}, using provisional")
            else:
                rt_thinned.append(f)
                rt_keys_thinned.append(k)

        combined_keys=prov_keys + rt_keys_thinned
        combined_filenames=prov_downloaded_files + rt_thinned

        # Now sort the keys and filenames with some python zip magic
        step1=zip(combined_keys, combined_filenames)
        step2=sorted(step1)
        sorted_keys_tuple, sorted_filenames_tuple = zip(*step2)
        sorted_keys = list(sorted_keys_tuple)
        sorted_filenames = list(sorted_filenames_tuple)

        # Now iterate through the time-sorted file list, load and combine the data.

        for fname in sorted_filenames:
            try:
                with open(fname, "r") as file:
                    html_text = file.read()
                file_times, file_data = parse_ae_html(html_text)
                times.extend(file_times)
                data.extend(file_data)
            except Exception as e:
                logging.error(
                    "Error occurred while parsing " + fname + ": " + str(e)
                )
                continue  # skip to the next file

        if len(times) == 0 and not download_only:
            logging.error("No data found for " + datatype + " in the given time range.")
            continue
        if not download_only:
            # Save tplot variables
            varname = prefix + "kyoto_" + datatype + suffix
            attr = {
                "source_name": "Kyoto Provisonal",
                "instrument": datatype.upper(),
                "reference": "https://wdc.kugi.kyoto-u.ac.jp/",
                "units": "nT",
                "acknowledgement": ack,
            }
            store_data(varname, data={"x": times, "y": data}, attr_dict=attr)
            options(varname, "ytitle", "Kyoto " + datatype.upper())
            options(varname, "ysubtitle", "[nT]")

            vars.append(varname)

            if time_clip:
                tclip(varname, trange[0], trange[1], overwrite=True)

    return vars



[docs]
def load_ae(
        trange=None,
        datatypes=["ae", "al", "ao", "au", "ax"],
        time_clip=True,
        remote_data_dir="",
        prefix="",
        suffix="",
        no_download=False,
        local_data_dir="",
        download_only=False,
        force_download=False,
        realtime=False,
):
    """
    Downloads and loads data from the Kyoto World Data Center for Geomagnetism.
    The realtime and provisional data sets are organized a little differently, so if necessary,
    this routine makes two calls to load_ae_worker (one for provisional, one for realtime) and
    combines the results.

    Parameters
    ----------
    trange : list
        The time range of the data to download and load.
        Format: [start_time, end_time].
    datatypes : list, optional
        The types of AE index data to download and load.
        Default: ["ae", "al", "ao", "au", "ax"].
    time_clip : bool, optional
        Whether to clip the loaded data to the specified time range.
        Default: True.
    remote_data_dir : str, optional
        The remote directory where the data is located.
        Default: "https://wdc.kugi.kyoto-u.ac.jp/ae_provisional/".
    prefix : str, optional
        A prefix to add to the variable names of the loaded data.
        Default: "".
    suffix : str, optional
        A suffix to add to the variable names of the loaded data.
        Default: "".
    no_download : bool, optional
        If True, the data will not be downloaded, the local file will be used.
        Default: False.
    local_data_dir : str, optional
        The local directory where the data will be downloaded.
        Default: "".
    download_only : bool, optional
        If True, only download the data, do not load it.
        Default: False.
    force_download: bool
        Download file even if local version is more recent than server version
        Default: False

    Returns
    -------
    list
        A list of tplot variable names created.

    Examples
    --------
    >>> from pyspedas.projects.kyoto import load_ae
    >>> trange = ['2010-01-01 12:00:00', '2010-01-01 18:00:00']
    >>> vars = load_ae(trange=trange)
    >>> print(vars)
    ['kyoto_ae', 'kyoto_al', 'kyoto_ao', 'kyoto_au', 'kyoto_ax']
    """

    vars = []  # list of tplot variables created

    if trange is None or len(trange) != 2:
        logging.error("load_ae: Keyword trange with two datetimes is required to download data.")
        return vars
    trange_dbl = time_double(trange)
    earliest_data = time_double('1996-01-01')
    if trange_dbl[0] >= trange_dbl[1]:
        logging.error(f"load_ae: Invalid time range. End time {trange[1]} must be greater than start time {trange[0]}.")
        return vars
    if trange_dbl[1] < earliest_data:
        logging.error(f"load_ae: Invalid time range. End time {trange[1]} is earlier than 1996-01-01")
        return vars

    if local_data_dir == "":
        local_data_dir = CONFIG["local_data_dir"]
    if local_data_dir[-1] != "/":
        local_data_dir += "/"

    if remote_data_dir == "":
        remote_data_dir = CONFIG["remote_data_dir_ae"]

    ack = """
            ******************************
            The provisional AE data are provided by the World Data Center for Geomagnetism, Kyoto,
            and are not for redistribution (https://wdc.kugi.kyoto-u.ac.jp/). Furthermore, we thank
            AE stations (Abisko [SGU, Sweden], Cape Chelyuskin [AARI, Russia], Tixi [IKFIA and
            AARI, Russia], Pebek [AARI, Russia], Barrow, College [USGS, USA], Yellowknife,
            Fort Churchill, Sanikiluaq (Poste-de-la-Baleine) [CGS, Canada], Narsarsuaq [DMI,
            Denmark], and Leirvogur [U. Iceland, Iceland]) as well as the RapidMAG team for
            their cooperations and efforts to operate these stations and to supply data for the provisional
            AE index to the WDC, Kyoto. (Pebek is a new station at geographic latitude of 70.09N
            and longitude of 170.93E, replacing the closed station Cape Wellen.)
            ******************************
        """
    earliest_mixed_date = "2024-05-10/00:00:00"
    latest_mixed_date = "2024-05-15/00:00:00"
    latest_provisional_only = "2021-01-01/00:00:00"
    skip_realtime = False
    skip_provisional = False
    if time_double(trange[1]) <= time_double(latest_provisional_only):
        skip_realtime = True
    elif time_double(trange[0]) >= time_double(latest_mixed_date):
        skip_provisional = True

    retvalue = load_ae_worker(trange=trange,
                              datatypes=datatypes,
                              time_clip=time_clip,
                              remote_data_dir=remote_data_dir,
                              prefix=prefix,
                              suffix=suffix,
                              no_download=no_download,
                              local_data_dir=local_data_dir,
                              force_download=force_download,
                              skip_realtime=skip_realtime,
                              skip_provisional=skip_provisional,
                              )
    # Print the acknowledgments
    logging.info(ack)
    return retvalue