# Source code for energia.utils.nsrdb

"""Fetch data from NREL's NSRDB database"""

import logging

from numpy import array, average
from pandas import DataFrame, to_datetime
from scipy.spatial import cKDTree

# Package-level logger; the missing-dependency warning in fetch_nsrdb_data
# is emitted through it.
logger = logging.getLogger("energia")

# h5pyd is an optional dependency, so its absence must not break importing
# this module.
# NOTE: the flag name is inverted relative to what it suggests --
# ``import_all`` is False when h5pyd imported successfully and True when the
# import FAILED. fetch_nsrdb_data checks it and returns early (with a warning)
# when it is True.
try:
    import h5pyd

    import_all = False
except ImportError:
    import_all = True


def _split_get_metric(get: str) -> tuple[str, str]:
    """Split a 'get' selector such as 'max-population' into (mode, metric)."""
    parts = get.split("-")
    if len(parts) < 2 or parts[0] not in ("min", "max"):
        raise ValueError(
            f"get must look like 'min-<metric>' or 'max-<metric>', got {get!r}",
        )
    return parts[0], parts[1]


def _nearest_site_index(coords, lat_lon) -> int:
    """Return the row position in *coords* closest to *lat_lon*."""
    # cKDTree.query uses plain Euclidean distance on the raw coordinate
    # values (no great-circle correction).
    _, position = cKDTree(coords).query(array([lat_lon[0], lat_lon[1]]))
    return int(position)


def _select_county_site(
    meta: DataFrame,
    state: str,
    county: str,
    mode: str,
    metric_name: str,
) -> tuple[int, tuple[float, float]]:
    """Pick the county site whose *metric_name* is the minimum or maximum.

    Returns the site's index label in *meta* and its (latitude, longitude).
    """
    # 'state' and 'county' are compared as bytes, matching how the meta
    # table is queried by the encoded names.
    in_state = meta.loc[meta["state"] == state.encode()]
    in_county = in_state.loc[in_state["county"] == county.encode()]
    if in_county.empty:
        raise ValueError(
            f"no NSRDB sites found for county {county!r} in state {state!r}",
        )

    metric = in_county[metric_name]
    extreme = metric.min() if mode == "min" else metric.max()
    # On ties the first matching site wins.
    site = in_county.loc[metric == extreme].iloc[0]
    return int(site.name), (float(site["latitude"]), float(site["longitude"]))


def fetch_nsrdb_data(
    attrs: list[str],
    year: int,
    lat_lon: tuple[float, float] | None = None,
    state: str = "",
    county: str = "",
    resolution: str = "",
    get: str = "max-population",
    save: str | None = None,
) -> tuple[tuple[float, float], DataFrame] | None:
    """
    Fetches nsrdb data from nearest coordinates (latitude, longitude)
    or from county in a state matching a particular 'get' metric

    :param attrs: attributes to fetch
    :type attrs: list[str]
    :param year: year of choice, e.g. 2019
    :type year: int
    :param lat_lon: (latitude, longitude) to fetch closest data point.
        When given, state, county and get are ignored. Defaults to None.
    :type lat_lon: tuple[float, float] | None
    :param state: capitalized state name, e.g. 'Texas' . Defaults to ''.
    :type state: str
    :param county: capitalized county name, e.g. 'Brazos' . Defaults to ''.
    :type county: str
    :param resolution: choose from 'halfhourly', 'hourly', 'daily'.
        Must be supplied; the default '' is not a valid choice.
    :type resolution: str
    :param get: Defaults to 'max-population'. From within county choose the
        data point that matches one of the following.
        'max-population', 'max-elevation', 'max-landcover'
        'min-population', 'min-elevation', 'min-landcover'
    :type get: str
    :param save: path (without extension) to save the data; '.csv' is
        appended. Defaults to None.
    :type save: str | None

    :return: ((latitude, longitude), DataFrame with output data), or None
        when h5pyd is not installed
    :rtype: tuple | None

    :raises KeyError: if resolution is not one of the supported choices
    :raises ValueError: if get is malformed, or no site matches state/county
    """
    if import_all:
        logger.warning(
            "⚠ This is an optional feature. Please install h5pyd, or pip install energiapy[all] ⚠",
        )
        return None

    # Number of native 30-minute samples averaged into one output row.
    samples_per_step = {
        "halfhourly": 1,  # native data set at 30 mins
        "hourly": 2,  # averages over the hour
        "daily": 48,  # averages over the day
    }
    # Validate inputs before any remote access so bad arguments fail fast.
    try:
        step = samples_per_step[resolution]
    except KeyError:
        raise KeyError(
            f"resolution must be one of {sorted(samples_per_step)}, got {resolution!r}",
        ) from None
    if lat_lon is None:
        mode, metric_name = _split_get_metric(get)

    # The context manager guarantees the remote file handle is released,
    # including when a lookup below raises.
    with h5pyd.File(f"/nrel/nsrdb/v3/nsrdb_{year!s}.h5", "r") as nsrdb_data:
        time_index = to_datetime(nsrdb_data["time_index"][...].astype(str))

        if lat_lon is not None:
            idx = _nearest_site_index(nsrdb_data["coordinates"][...], lat_lon)
        else:
            idx, lat_lon = _select_county_site(
                DataFrame(nsrdb_data["meta"][...]),
                state,
                county,
                mode,
                metric_name,
            )

        columns = {}
        for attr in attrs:
            dataset = nsrdb_data[attr]
            raw = dataset[:, idx]  # native data set at 30 mins
            # Average over the requested resolution, then undo the integer
            # scaling the dataset was stored with.
            columns[attr] = (
                average(raw.reshape(-1, step), axis=1)
                / dataset.attrs["psm_scale_factor"]
            )

    # Each output row is stamped with the first timestamp of its window.
    averaged_output = DataFrame(columns, index=time_index[::step])

    if save is not None:
        averaged_output.to_csv(save + ".csv")

    return lat_lon, averaged_output