Source code for spaceweather.gfz

# Copyright (c) 2020--2024 Stefan Bender
#
# This module is part of pyspaceweather.
# pyspaceweather is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying COPYING.GPLv2 file or http://www.gnu.org/licenses/gpl-2.0.html.
"""Python interface for space weather indices from GFZ Potsdam

GFZ space weather indices ASCII file parser for python [#]_.

.. [#] https://kp.gfz-potsdam.de/en/
"""
import os
from pkg_resources import resource_filename
import logging
from warnings import warn

import numpy as np
import pandas as pd

from .core import _assert_file_exists, _dl_file

__all__ = [
	"gfz_daily", "gfz_3h", "read_gfz",
	"get_gfz_age", "update_gfz",
	"GFZ_PATH_ALL", "GFZ_PATH_30D",
]

GFZ_URL_ALL = "https://kp.gfz-potsdam.de/app/files/Kp_ap_Ap_SN_F107_since_1932.txt"
GFZ_URL_30D = "https://kp.gfz-potsdam.de/app/files/Kp_ap_Ap_SN_F107_nowcast.txt"
GFZ_FILE_ALL = os.path.basename(GFZ_URL_ALL)
GFZ_FILE_30D = os.path.basename(GFZ_URL_30D)
GFZ_PATH_ALL = resource_filename(__name__, os.path.join("data", GFZ_FILE_ALL))
GFZ_PATH_30D = resource_filename(__name__, os.path.join("data", GFZ_FILE_30D))



[docs]
def get_gfz_age(gfzpath, relative=True):
	"""Age of the downloaded data file

	Retrieves the last update time of the given file or full path.

	Parameters
	----------
	gfzpath: str
		Filename to check, absolute path or relative to the current dir.
	relative: bool, optional, default True
		Return the file's age (True) or the last update time (False).

	Returns
	-------
	upd: pandas.Timestamp or pandas.Timedelta
		The last updated time or the file age, depending on the setting
		of `relative` above.
		Raises ``IOError`` if the file is not found.
	"""
	_assert_file_exists(gfzpath)
	with open(gfzpath) as fp:
		for line in fp:
			# forward to last line
			pass
	upd = pd.to_datetime(line[:10].replace(" ", "-"), utc=True)
	if relative:
		return pd.Timestamp.utcnow() - upd
	return upd




[docs]
def update_gfz(
	min_age="1d",
	gfzpath_all=GFZ_PATH_ALL, gfzpath_30d=GFZ_PATH_30D,
	url_all=GFZ_URL_ALL, url_30d=GFZ_URL_30D,
):
	"""Update the local space weather index data

	Updates the local space weather index data from the website [#]_,
	given that the 30-day file is older than `min_age`,
	or the combined (large) file is older than 30 days.
	If the data is missing for some reason, a download will be attempted nonetheless.

	All arguments are optional and changing them from the defaults should
	neither be necessary nor is it recommended.

	.. [#] https://kp.gfz-potsdam.de/en/

	Parameters
	----------
	min_age: str, optional, default "1d"
		The time after which a new download will be attempted.
		The online data is updated every day, thus setting this value to
		a shorter time is not needed and not recommended.
	gfzpath_all: str, optional, default depending on package install location
		Filename for the large combined index file including the
		historic data, absolute path or relative to the current dir.
	gfzpath_30d: str, optional, default depending on package install location
		Filename for the 30-day (nowcast) index file, absolute path or relative
		to the current dir.
	url_all: str, optional, default `gfz.GFZ_URL_ALL`
		The url of the "historic" data file.
	url_30d: str, optional, default `gfz.GFZ_URL_30D`
		The url of the data file containing the indices for the last 30 days.

	Returns
	-------
	Nothing.
	"""
	def _update_file(gfzpath, url, min_age):
		if not os.path.exists(gfzpath):
			logging.info("{0} not found, downloading.".format(gfzpath))
			_dl_file(gfzpath, url)
			return
		if get_gfz_age(gfzpath) < pd.Timedelta(min_age):
			logging.info("not updating '{0}'.".format(gfzpath))
			return
		logging.info("updating '{0}'.".format(gfzpath))
		_dl_file(gfzpath, url)

	# Update the large file after 30 days
	_update_file(gfzpath_all, url_all, "30days")
	# Don't re-download before `min_age` has passed (1d)
	_update_file(gfzpath_30d, url_30d, min_age)




[docs]
def read_gfz(gfzpath):
	"""Read and parse space weather index data file

	Reads the given file and parses it according to the space weather data format.

	Parameters
	----------
	gfzpath: str
		File to parse, absolute path or relative to the current dir.

	Returns
	-------
	gfz_df: pandas.DataFrame
		The parsed space weather data (daily values).
		Raises an ``IOError`` if the file is not found.

		The dataframe contains the following columns:

		"year", "month", "day":
			The observation date
		"bsrn":
			Bartels Solar Rotation Number.
			A sequence of 27-day intervals counted continuously from 1832 Feb 8.
		"rotd":
			Number of Day within the Bartels 27-day cycle (01-27).
		"Kp0", "Kp3", "Kp6", "Kp9", "Kp12", "Kp15", "Kp18", "Kp21":
			Planetary 3-hour Range Index (Kp) for 0000-0300, 0300-0600,
			0600-0900, 0900-1200, 1200-1500, 1500-1800, 1800-2100, 2100-2400 UT
		"Kpsum": Sum of the 8 Kp indices for the day.
			Expressed to the nearest third of a unit.
		"Ap0", "Ap3", "Ap6", "Ap9", "Ap12", "Ap15", "Ap18", "Ap21":
			Planetary Equivalent Amplitude (Ap) for 0000-0300, 0300-0600,
			0600-0900, 0900-1200, 1200-1500, 1500-1800, 1800-2100, 2100-2400 UT
		"Apavg":
			Arithmetic average of the 8 Ap indices for the day.
		"isn":
			International Sunspot Number.
			Records contain the Zurich number through 1980 Dec 31 and the
			International Brussels number thereafter.
		"f107_obs":
			Observed (unadjusted) value of F10.7.
		"f107_adj":
			10.7-cm Solar Radio Flux (F10.7) Adjusted to 1 AU.
			Measured at Ottawa at 1700 UT daily from 1947 Feb 14 until
			1991 May 31 and measured at Penticton at 2000 UT from 1991 Jun 01 on.
			Expressed in units of 10-22 W/m2/Hz.
		"D":
			Definitive indicator.
			0: Kp and SN preliminary
			1: Kp definitive, SN preliminary
			2: Kp and SN definitive
	"""
	_assert_file_exists(gfzpath)
	gfz = np.genfromtxt(
		gfzpath,
		skip_header=3,
		delimiter=[
		#  yy mm dd dd dm br db kp kp kp kp kp kp kp kp
			4, 3, 3, 6, 8, 5, 3, 7, 7, 7, 7, 7, 7, 7, 7,
		#  ap ap ap ap ap ap ap ap Ap sn f1 f2 def
			5, 5, 5, 5, 5, 5, 5, 5, 6, 4, 9, 9, 2,
		],
		dtype=(
			"i4,i4,i4,i4,f4,i4,i4,f4,f4,f4,f4,f4,f4,f4,"
			"f4,i4,i4,i4,i4,i4,i4,i4,i4,i4,i4,f8,f8,i4,"
		),
		names=[
			"year", "month", "day", "days", "days_m", "bsrn", "rotd",
			"Kp0", "Kp3", "Kp6", "Kp9", "Kp12", "Kp15", "Kp18", "Kp21",
			"Ap0", "Ap3", "Ap6", "Ap9", "Ap12", "Ap15", "Ap18", "Ap21", "Apavg",
			"isn", "f107_obs", "f107_adj", "D",
		]
	)
	gfz = gfz[gfz["year"] != -1]
	ts = pd.to_datetime([
		"{0:04d}-{1:02d}-{2:02d}".format(yy, mm, dd)
		for yy, mm, dd in gfz[["year", "month", "day"]]
	])
	gfz_df = pd.DataFrame(gfz, index=ts)
	# Sum Kp for compatibility with celestrak dataframe
	kpns = list(map("Kp{0}".format, range(0, 23, 3)))
	gfz_df.insert(15, "Kpsum", gfz_df[kpns].sum(axis=1))
	return gfz_df



def read_gfz_wdc(gfzpath):
	"""Parse space weather index data file in WDC format

	Parses the GFZ index data in WDC format.

	Parameters
	----------
	gfzpath: str
		File to parse, absolute path or relative to the current dir.

	Returns
	-------
	gfz_df: pandas.DataFrame
		The parsed space weather data (daily values).
		Raises an ``IOError`` if the file is not found.

		The dataframe contains the following columns:

		"year", "month", "day":
			The observation date
		"bsrn":
			Bartels Solar Rotation Number.
			A sequence of 27-day intervals counted continuously from 1832 Feb 8.
		"rotd":
			Number of Day within the Bartels 27-day cycle (01-27).
		"Kp0", "Kp3", "Kp6", "Kp9", "Kp12", "Kp15", "Kp18", "Kp21":
			Planetary 3-hour Range Index (Kp) for 0000-0300, 0300-0600,
			0600-0900, 0900-1200, 1200-1500, 1500-1800, 1800-2100, 2100-2400 UT
		"Kpsum": Sum of the 8 Kp indices for the day.
			Expressed to the nearest third of a unit.
		"Ap0", "Ap3", "Ap6", "Ap9", "Ap12", "Ap15", "Ap18", "Ap21":
			Planetary Equivalent Amplitude (Ap) for 0000-0300, 0300-0600,
			0600-0900, 0900-1200, 1200-1500, 1500-1800, 1800-2100, 2100-2400 UT
		"Apavg":
			Arithmetic average of the 8 Ap indices for the day.
		"Cp":
			Cp index - the daily planetary character figure, a qualitative
			estimate of the overall level of geomagnetic activity for this day
			determined from the sum of the eight ap amplitudes,
			ranging from 0.0 to 2.5 in steps of 0.1.
		"C9":
			The contracted scale for Cp with only 1 digit, from 0 to 9.
	"""
	_assert_file_exists(gfzpath)
	gfz = np.genfromtxt(
		gfzpath,
		skip_header=3,
		delimiter=[
		#  yy mm dd br db kp kp kp kp kp kp kp kp kps
			2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
		#  ap ap ap ap ap ap ap ap Ap Cp C9
			3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1,
		],
		dtype=(
			"i4,i4,i4,i4,i4,i4,i4,i4,i4,i4,i4,i4,i4,i4,"
			"i4,i4,i4,i4,i4,i4,i4,i4,i4,f8,i4,"
		),
		names=[
			"year", "month", "day", "bsrn", "rotd",
			"Kp0", "Kp3", "Kp6", "Kp9", "Kp12", "Kp15", "Kp18", "Kp21", "Kpsum",
			"Ap0", "Ap3", "Ap6", "Ap9", "Ap12", "Ap15", "Ap18", "Ap21", "Apavg",
			"Cp", "C9",
		]
	)
	gfz = gfz[gfz["year"] != -1]
	ts = pd.to_datetime([
		"{0:04d}-{1:02d}-{2:02d}".format(2000 + yy if yy < 32 else 1900 + yy, mm, dd)
		for yy, mm, dd in gfz[["year", "month", "day"]]
	])
	gfz_df = pd.DataFrame(gfz, index=ts)
	gfz_df.loc[:, "year"] = ts.year
	# Adjust Kp to 0...9
	kpns = list(map("Kp{0}".format, range(0, 23, 3))) + ["Kpsum"]
	gfz_df[kpns] = 0.1 * gfz_df[kpns]
	return gfz_df


# Common arguments for the public daily and 3h interfaces
_GFZ_COMMON_PARAMS = """
	Parameters
	----------
	gfzpath_all: str, optional, default depending on package install location
		Filename for the large combined index file including the
		historic data, absolute path or relative to the current dir.
	gfzpath_30d: str, optional, default depending on package install location
		Filename for the 30-day (nowcast) index file,
		absolute path or relative to the current dir.
	update: bool, optional, default False
		Attempt to update the local data if it is older than `update_interval`.
	update_interval: str, optional, default "30days"
		The time after which the data are considered "old".
		By default, no automatic re-download is initiated, set `update` to true.
		The online data is updated every 3 hours, thus setting this value to
		a shorter time is not needed and not recommended.
"""


def _doc_param(**sub):
	def dec(obj):
		obj.__doc__ = obj.__doc__.format(**sub)
		return obj
	return dec



[docs]
@_doc_param(params=_GFZ_COMMON_PARAMS)
def gfz_daily(gfzpath_all=GFZ_PATH_ALL, gfzpath_30d=GFZ_PATH_30D, update=False, update_interval="10days"):
	"""Combined daily Ap, Kp, and f10.7 index values

	Combines the "historic" and last-30-day data into one dataframe.

	All arguments are optional and changing them from the defaults should not
	be required neither should it be necessary nor is it recommended.
	{params}
	Returns
	-------
	gfz_df: pandas.DataFrame
		The combined parsed space weather data (daily values).
		Raises ``IOError`` if the data files cannot be found.

	See Also
	--------
	gfz_3h, read_gfz
	"""
	# ensure that the file exists and is up to date
	if (
		not os.path.exists(gfzpath_all)
		or not os.path.exists(gfzpath_30d)
	):
		warn("Could not find space weather data, trying to download.")
		update_gfz(gfzpath_all=gfzpath_all, gfzpath_30d=gfzpath_30d)

	if (
		get_gfz_age(gfzpath_all) > pd.Timedelta("30days")
		or get_gfz_age(gfzpath_30d) > pd.Timedelta(update_interval)
	):
		if update:
			update_gfz(gfzpath_all=gfzpath_all, gfzpath_30d=gfzpath_30d)
		else:
			warn(
				"Local data files are older than {0}, pass `update=True` or "
				"run `gfz.update_gfz()` manually if you need newer data.".format(
					update_interval
				)
			)

	df_all = read_gfz(gfzpath_all)
	df_30d = read_gfz(gfzpath_30d)
	return pd.concat([df_all[:df_30d.index[0]], df_30d[1:]])




[docs]
@_doc_param(params=_GFZ_COMMON_PARAMS)
def gfz_3h(*args, **kwargs):
	"""3h values of Ap and Kp

	Provides the 3-hourly Ap and Kp indices from the full daily data set.

	Accepts the same arguments as `gfz_daily()`.
	All arguments are optional and changing them from the defaults should not
	be required neither should it be necessary nor is it recommended.
	{params}
	Returns
	-------
	gfz_df: pandas.DataFrame
		The combined Ap and Kp index data (3h values).
		The index values are centred at the 3h interval, i.e. at 01:30:00,
		04:30:00, 07:30:00, ... and so on.
		Raises ``IOError`` if the data files cannot be found.

	See Also
	--------
	gfz_daily
	"""
	daily_df = gfz_daily(*args, **kwargs)
	ret = daily_df.copy()
	apns = list(map("Ap{0}".format, range(0, 23, 3)))
	kpns = list(map("Kp{0}".format, range(0, 23, 3)))
	for i, (ap, kp) in enumerate(zip(apns, kpns)):
		ret[ap].index = daily_df[ap].index + pd.Timedelta((i * 3 + 1.5), unit="h")
		ret[kp].index = daily_df[kp].index + pd.Timedelta((i * 3 + 1.5), unit="h")
	gfz_ap = pd.concat(map(ret.__getitem__, apns))
	gfz_kp = pd.concat(map(ret.__getitem__, kpns))
	df = pd.DataFrame({"Ap": gfz_ap, "Kp": gfz_kp})
	return df.reindex(df.index.sort_values())