## Brian Blaylock
## May 3, 2021
"""
============
Herbie Tools
============
"""
from datetime import datetime, timedelta
import logging
import cartopy.crs as ccrs
import metpy # accessor needed to parse crs
import numpy as np
import pandas as pd
import xarray as xr
from herbie.archive import Herbie, wgrib2_idx_to_str
from . import Path
# Multithreading :)
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed, wait
log = logging.getLogger(__name__)
"""
🧵🤹🏻‍♂️ Notice! Multithreading and Multiprocessing are used.
This is my first implementation of multithreading to create, download,
and read many Herbie objects. This drastically reduces the time it takes
to create a Herbie object (which is just looking for if and where a
GRIB2 file exists on the internet) and to download a file.
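
For example, a minimal sketch (assuming this module is importable as
``herbie.tools`` and that HRRR data is available for these dates)::

    from herbie.tools import FastHerbie
    import pandas as pd

    DATES = pd.date_range("2022-01-01 00:00", periods=24, freq="1H")
    FH = FastHerbie(DATES, model="hrrr", fxx=[0])
    ds = FH.xarray("TMP:2 m")  # read 2-m temperature into one Dataset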
"""
def _validate_fxx(fxx):
"""Fast Herbie requires fxx as a list-like"""
if isinstance(fxx, int):
fxx = [fxx]
if not isinstance(fxx, (list, range)):
raise ValueError(f"fxx must be an int, list, or range. Gave {fxx}")
return fxx
def _validate_DATES(DATES):
"""Fast Herbie requires DATES as a list-like"""
if isinstance(DATES, str):
DATES = [pd.to_datetime(DATES)]
elif not hasattr(DATES, "__len__"):
DATES = [pd.to_datetime(DATES)]
if not isinstance(DATES, (list, pd.DatetimeIndex)):
raise ValueError(
f"DATES must be a pandas-parsable datetime string or a list. Gave {DATES}"
)
return DATES
def Herbie_latest(n=6, freq="1H", **kwargs):
"""Search for the most recent GRIB2 file (using multithreading).
Parameters
----------
n : int
Number of attempts to try.
freq : pandas-parsable timedelta string
Time interval between each attempt.
Examples
--------
When ``n=6``, and ``freq='1H'``, Herbie will look for the latest
file within the last 6 hours (suitable for the HRRR model).
When ``n=3``, and ``freq='6H'``, Herbie will look for the latest
file within the last 18 hours (suitable for the GFS model).
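
    A minimal sketch (assumes recent HRRR files exist upstream):

    >>> from herbie.tools import Herbie_latest
    >>> H = Herbie_latest(n=6, freq="1H", model="hrrr")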
"""
current = pd.Timestamp.now("utc").tz_localize(None).floor(freq)
DATES = pd.date_range(
start=current - (pd.Timedelta(freq) * n),
end=current,
freq=freq,
)
FH = FastHerbie(DATES, **kwargs)
return FH.file_exists[-1]
class FastHerbie:
    def __init__(self, DATES, fxx=[0], *, max_threads=50, **kwargs):
"""Create many Herbie objects with methods to download or read with xarray.
Uses multithreading.
.. note::
            Currently, Herbie objects are looped over by run datetime
            (date) and forecast lead time (fxx).
Parameters
----------
DATES : pandas-parsable datetime string or list of datetimes
fxx : int or list of forecast lead times
max_threads : int
Maximum number of threads to use.
kwargs :
Remaining keywords for Herbie object
(e.g., model, product, priority, verbose, etc.)
Benchmark
---------
Creating 48 Herbie objects
- 1 thread took 16 s
- 2 threads took 8 s
- 5 threads took 3.3 s
- 10 threads took 1.7 s
- 50 threads took 0.5 s
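
        Examples
        --------
        A sketch (assumes HRRR data for these dates is available):

        >>> import pandas as pd
        >>> DATES = pd.date_range("2022-01-01 00:00", periods=4, freq="6H")
        >>> FH = FastHerbie(DATES, model="hrrr", fxx=range(0, 3))
        >>> len(FH.objects)  # one Herbie object per (date, fxx) pair
        12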
"""
self.DATES = _validate_DATES(DATES)
self.fxx = _validate_fxx(fxx)
kwargs.setdefault("verbose", False)
################
# Multithreading
        self.tasks = len(self.DATES) * len(self.fxx)
threads = min(self.tasks, max_threads)
        log.info(f"🧵 Working on {self.tasks} tasks with {threads} threads.")
with ThreadPoolExecutor(threads) as exe:
futures = [
                exe.submit(Herbie, date=DATE, fxx=f, **kwargs)
                for DATE in self.DATES
                for f in self.fxx
]
# Return list of Herbie objects in order completed
self.objects = [future.result() for future in as_completed(futures)]
log.info(f"Number of Herbie objects: {len(self.objects)}")
# Sort the list of Herbie objects by lead time then by date
self.objects.sort(key=lambda H: H.fxx)
self.objects.sort(key=lambda H: H.date)
# Which files exist?
self.file_exists = [H for H in self.objects if H.grib is not None]
self.file_not_exists = [H for H in self.objects if H.grib is None]
if len(self.file_not_exists) > 0:
            log.warning(
                f"Could not find {len(self.file_not_exists)}/{len(self.objects)} GRIB files."
            )
def __len__(self):
return len(self.objects)
    def df(self):
        """Organize Herbie objects into a DataFrame.

        Note: displaying this DataFrame can take several seconds
        because each Herbie object's ``__str__`` does a lot of work.
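
        A sketch (assumes ``FH`` is a FastHerbie instance): one row per
        run date, one column per lead time.

        >>> FH.df()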
"""
        rows = [
            self.objects[x : x + len(self.fxx)]
            for x in range(0, len(self.objects), len(self.fxx))
        ]
        return pd.DataFrame(
            rows, index=self.DATES, columns=[f"F{i:02d}" for i in self.fxx]
)
    def download(self, searchString=None, *, max_threads=20, **download_kwargs):
r"""Download many Herbie objects
Uses multithreading.
Parameters
----------
searchString : string
Regular expression string to specify which GRIB messages to
download.
**download_kwargs :
Any kwarg for Herbie's download method.
Benchmark
---------
Downloading 48 files with 1 variable (TMP:2 m)
- 1 thread took 1 min 17 s
- 2 threads took 36 s
- 5 threads took 28 s
- 10 threads took 25 s
- 50 threads took 23 s
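
        Examples
        --------
        A sketch (assumes ``FH`` found valid GRIB2 files):

        >>> FH.download("TMP:2 m")  # subset matching GRIB messages
        >>> FH.download()  # or download the full files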
"""
###########################
# Multithread the downloads
threads = min(self.tasks, max_threads)
        log.info(f"🧵 Working on {self.tasks} tasks with {threads} threads.")
        with ThreadPoolExecutor(threads) as exe:
futures = [
exe.submit(H.download, searchString, **download_kwargs)
for H in self.file_exists
]
            # Collect the downloaded file paths in the order completed
outFiles = [future.result() for future in as_completed(futures)]
return outFiles
    def xarray(
        self,
        searchString,
        *,
        max_threads=None,
        **xarray_kwargs,
    ):
        """Read many Herbie objects into an xarray Dataset.

        # TODO: Sometimes the Jupyter cell crashes when I run this:
        # TODO: "fatal flex scanner internal error--end of buffer missed"

        Uses multithreading. This would likely benefit from
        multiprocessing instead (see the note in the code about pickling).
Parameters
----------
max_threads : int
Control the maximum number of threads to use.
If you use too many threads, you may run into memory limits.
Benchmark
---------
Opening 48 files with 1 variable (TMP:2 m)
- 1 thread took 1 min 45 s
- 2 threads took 55 s
- 5 threads took 39 s
- 10 threads took 39 s
- 50 threads took 37 s
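
        Examples
        --------
        A sketch (assumes ``FH`` found valid GRIB2 files):

        >>> ds = FH.xarray("TMP:2 m", max_threads=4)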
"""
xarray_kwargs = dict(searchString=searchString, **xarray_kwargs)
# NOTE: Multiprocessing does not seem to work because it looks
# NOTE: like xarray objects are not pickleable.
# NOTE: ``Reason: 'TypeError("cannot pickle '_thread.lock' object"``
if max_threads:
            ###########################
            # Multithread the reads
# ! Only works sometimes
# ! I get this error: "'EntryPoint' object has no attribute '_key'""
threads = min(self.tasks, max_threads)
            log.info(f"🧵 Working on {self.tasks} tasks with {threads} threads.")
            with ThreadPoolExecutor(threads) as exe:
futures = [
exe.submit(H.xarray, **xarray_kwargs) for H in self.file_exists
]
                # Collect the Datasets in the order completed
ds_list = [future.result() for future in as_completed(futures)]
else:
ds_list = [H.xarray(**xarray_kwargs) for H in self.file_exists]
        # Sort the Datasets, first by lead time (step), then by run time (time)
ds_list.sort(key=lambda x: x.step.data.max())
ds_list.sort(key=lambda x: x.time.data.max())
# Reshape list with dimensions (len(DATES), len(fxx))
ds_list = [
ds_list[x : x + len(self.fxx)]
for x in range(0, len(ds_list), len(self.fxx))
]
# Concat DataSets
try:
ds = xr.combine_nested(
ds_list,
concat_dim=["time", "step"],
combine_attrs="drop_conflicts",
)
        except Exception:
            # TODO: Not sure why some cases don't like the combine_attrs argument.
ds = xr.combine_nested(
ds_list,
concat_dim=["time", "step"],
)
ds["gribfile_projection"] = ds.gribfile_projection[0][0]
ds = ds.squeeze()
return ds
###############################################################################
########################################################################
########################################################################
def create_index_files(path, overwrite=False):
"""Create an index file for all GRIB2 files in a directory.
# TODO: use Path().expand()
Parameters
----------
path : str or pathlib.Path
Path to directory or file.
overwrite : bool
Overwrite index file if it exists.
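
    Example (a sketch; assumes wgrib2 is available and the hypothetical
    directory /data/hrrr holds GRIB2 files):

    >>> create_index_files("/data/hrrr", overwrite=False)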
"""
path = Path(path)
files = []
if path.is_dir():
# List all GRIB2 files in the directory
files = list(path.rglob("*.grib2*"))
elif path.is_file():
# The path is a single file
files = [path]
if not files:
        raise ValueError(f"No GRIB2 files were found in {path}")
for f in files:
f_idx = Path(str(f) + ".idx")
if not f_idx.exists() or overwrite:
# Create an index using wgrib2's simple inventory option
# if it doesn't already exist or if overwrite is True.
index_data = wgrib2_idx_to_str(Path(f))
with open(f_idx, "w+") as out_idx:
out_idx.write(index_data)
########################################################################
########################################################################
# !OLD, use FastHerbie class instead
def fast_Herbie(DATES, fxx=[0], *, max_threads=50, **kwargs):
"""
Create many Herbie objects with Multithreading.
.. note::
        Currently, Herbie objects are looped over by run datetime
        (date) and forecast lead time (fxx).
Parameters
----------
DATES : pandas-parsable datetime string or list of datetimes
fxx : int or list of forecast lead times
max_threads : int
Maximum number of threads to use.
kwargs :
Remaining keywords for Herbie object
(e.g., model, product, priority, verbose, etc.)
Benchmark
---------
Creating 48 Herbie objects
- 1 thread took 16 s
- 2 threads took 8 s
- 5 threads took 3.3 s
- 10 threads took 1.7 s
- 50 threads took 0.5 s
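
    Prefer the FastHerbie class instead; a sketch of the equivalent:

    >>> FH = FastHerbie(DATES, fxx=[0], max_threads=50)
    >>> H_list = FH.objects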
"""
DATES = _validate_DATES(DATES)
fxx = _validate_fxx(fxx)
kwargs.setdefault("verbose", False)
################
# Multithreading
tasks = len(DATES) * len(fxx)
threads = min(tasks, max_threads)
    log.info(f"🧵 Working on {tasks} tasks with {threads} threads.")
with ThreadPoolExecutor(threads) as exe:
futures = [
exe.submit(Herbie, date=DATE, fxx=f, **kwargs)
for DATE in DATES
for f in fxx
]
# Return list of Herbie objects in order completed
H_list = [future.result() for future in as_completed(futures)]
# Return list of Herbie objects in order submitted
# futures, _ = wait(futures)
# H_list = [future.result() for future in futures]
log.info(f"Number of Herbie objects: {len(H_list)}")
# Sort the list of Herbie objects by lead time then by date
H_list.sort(key=lambda H: H.fxx)
H_list.sort(key=lambda H: H.date)
return H_list
# !OLD, use FastHerbie class instead
def fast_Herbie_download(
DATES,
*,
searchString=None,
fxx=[0],
max_threads=20,
download_kw={},
**kwargs,
):
"""
    Use multithreading to download many Herbie objects.
Benchmark
---------
Downloading 48 files with 1 variable (TMP:2 m)
- 1 thread took 1 min 17 s
- 2 threads took 36 s
- 5 threads took 28 s
- 10 threads took 25 s
- 50 threads took 23 s
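
    Prefer ``FastHerbie(...).download(...)`` instead; a sketch:

    >>> FH = FastHerbie(DATES, fxx=[0])
    >>> FH.download("TMP:2 m")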
"""
DATES = _validate_DATES(DATES)
fxx = _validate_fxx(fxx)
kwargs.setdefault("verbose", False)
Hs = fast_Herbie(DATES, fxx=fxx, max_threads=max_threads, **kwargs)
passed = [H for H in Hs if H.grib is not None]
failed = [H for H in Hs if H.grib is None]
###########################
# Multithread the downloads
tasks = len(DATES) * len(fxx)
threads = min(tasks, max_threads)
    log.info(f"🧵 Working on {tasks} tasks with {threads} threads.")
    with ThreadPoolExecutor(threads) as exe:
futures = [exe.submit(H.download, searchString, **download_kw) for H in passed]
# Return list of Herbie objects in order completed
_ = [future.result() for future in as_completed(futures)]
if len(failed):
        log.warning(
            f"Herbie only downloaded {len(passed)}/{len(Hs)} files ({len(failed)} had no GRIB2 file)."
        )
return dict(passed=passed, failed=failed)
# !OLD, use FastHerbie class instead
def fast_Herbie_xarray(
DATES,
*,
searchString=None,
fxx=[0],
max_threads=5,
xarray_kw={},
**kwargs,
):
"""
    Use multithreading to open many Herbie objects with xarray.
Parameters
----------
max_threads : int
Control the maximum number of threads to use.
If you use too many threads, you may run into memory limits.
Benchmark
---------
Opening 48 files with 1 variable (TMP:2 m)
- 1 thread took 1 min 45 s
- 2 threads took 55 s
- 5 threads took 39 s
- 10 threads took 39 s
- 50 threads took 37 s
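
    Prefer ``FastHerbie(...).xarray(...)`` instead; a sketch:

    >>> FH = FastHerbie(DATES, fxx=[0])
    >>> ds = FH.xarray("TMP:2 m")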
"""
DATES = _validate_DATES(DATES)
fxx = _validate_fxx(fxx)
kwargs.setdefault("verbose", False)
Hs = fast_Herbie(DATES, fxx=fxx, max_threads=max_threads, **kwargs)
passed = [H for H in Hs if H.grib is not None]
failed = [H for H in Hs if H.grib is None]
    ###########################
    # Multithread the reads
tasks = len(DATES) * len(fxx)
threads = min(tasks, max_threads)
    log.info(f"🧵 Working on {tasks} tasks with {threads} threads.")
    with ThreadPoolExecutor(threads) as exe:
futures = [exe.submit(H.xarray, searchString, **xarray_kw) for H in passed]
# Return list of Herbie objects in order completed
ds_list = [future.result() for future in as_completed(futures)]
    # Sort the Datasets, first by lead time (step), then by run time (time)
ds_list.sort(key=lambda x: x.step.data.max())
ds_list.sort(key=lambda x: x.time.data.max())
# Reshape list with dimensions (len(DATES), len(fxx))
ds_list = [ds_list[x : x + len(fxx)] for x in range(0, len(ds_list), len(fxx))]
# Concat DataSets
try:
ds = xr.combine_nested(
ds_list,
concat_dim=["time", "step"],
combine_attrs="drop_conflicts",
)
    except Exception:
        # TODO: Not sure why some cases don't like the combine_attrs argument.
ds = xr.combine_nested(
ds_list,
concat_dim=["time", "step"],
)
ds["gribfile_projection"] = ds.gribfile_projection[0][0]
ds = ds.squeeze()
if len(failed):
log.warning(
f"Herbie only retrieved {len(passed)}/{len(Hs)} files. ({len(failed)} had no GRIB2 file)."
)
return ds
# ! Old: Use FastHerbie instead
def bulk_download(DATES, searchString=None, *, fxx=range(0, 1), verbose=True, **kwargs):
"""
Bulk download GRIB2 files from file source to the local machine.
Iterates over a list of datetimes (DATES) and forecast lead times (fxx).
Parameters
----------
DATES : list
List of datetimes
searchString : None or str
If None, download the full file. If string, use regex to search
index files for variables and levels of interest and only
download the matched GRIB messages.
fxx : int or list
List of forecast lead times to download. Default only downloads model analysis.
model : {'hrrr', 'hrrrak', 'rap'}
Model to download.
product : {'sfc', 'prs', 'nat', 'subh'}
Variable products file to download. Not needed for RAP model.
"""
    log.warning("`bulk_download` is deprecated. Use `fast_Herbie_download` instead")
if isinstance(DATES, (str, pd.Timestamp)) or hasattr(DATES, "strptime"):
DATES = [DATES]
if isinstance(fxx, int):
fxx = [fxx]
# Locate the file sources
    print("👨🏻‍🔬 Check which requested files exist")
grib_sources = fast_Herbie(DATES, fxx, **kwargs)
# Keep a list of successful and failed Herbie objects
success = []
failed = []
    loop_time = timedelta()
    est_rem_time = timedelta()  # initialize in case the first download fails
    n = len(grib_sources)
    print("\n🌧 Download requested data")
for i, H in enumerate(grib_sources):
try:
timer = datetime.now()
H.download(searchString=searchString)
# ---------------------------------------------------------
# Time keeping: *crude* method to estimate remaining time.
# ---------------------------------------------------------
loop_time += datetime.now() - timer
mean_dt_per_loop = loop_time / (i + 1)
remaining_loops = n - i - 1
est_rem_time = mean_dt_per_loop * remaining_loops
success.append(H)
except Exception as e:
print(f"WARNING: {e}")
failed.append(H)
if verbose:
            print(
                f"🚛💨 Download Progress: [{i+1}/{n} completed] >> Est. Time Remaining {str(est_rem_time):16}\n"
            )
# ---------------------------------------------------------
requested = len(grib_sources)
completed = sum([i.grib is not None for i in grib_sources])
    print(f"📦 Done! Downloaded [{completed}/{requested}] files. Timer={loop_time}")
return dict(success=success, failed=failed)
# ! Old: Use FastHerbie instead
def xr_concat_sameRun(DATE, searchString, fxx=range(0, 18), verbose=False, **kwargs):
"""
Load and concatenate xarray objects by forecast lead time for the same run.
Parameters
----------
DATE : pandas-parsable datetime
A datetime that represents the model initialization time.
searchString : str
        Variable fields to load. This really only works if the search
        string returns data on the same hypercube.
fxx : list of int
List of forecast lead times, in hours, to concat together.
"""
    log.warning("`xr_concat_sameRun` is deprecated. Use `fast_Herbie_xarray` instead")
Hs = fast_Herbie(DATE, fxx, **kwargs)
Hs_to_cat = [H.xarray(searchString, verbose=verbose) for H in Hs]
return xr.concat(Hs_to_cat, dim="f")
# ! Old: Use FastHerbie instead
def xr_concat_sameLead(DATES, searchString, fxx=0, verbose=False, **kwargs):
"""
Load and concatenate xarray objects by model initialization date for the same lead time.
Parameters
----------
DATES : list of pandas-parsable datetime
        Datetimes that represent the model initialization times.
searchString : str
        Variable fields to load. This really only works if the search
        string returns data on the same hypercube.
fxx : int
The forecast lead time, in hours.
"""
    log.warning("`xr_concat_sameLead` is deprecated. Use `fast_Herbie_xarray` instead")
Hs = fast_Herbie(DATES, fxx, **kwargs)
Hs_to_cat = [H.xarray(searchString, verbose=verbose) for H in Hs]
return xr.concat(Hs_to_cat, dim="t")
# ! Old: Use herbie accessor instead
def nearest_points(ds, points, names=None, verbose=True):
"""
Get the nearest latitude/longitude points from a xarray Dataset.
    This is **much** faster than my old "pluck_points" method. For
    matching 1,948 points:
- `nearest_points` completed in 7.5 seconds.
- `pluck_points` completed in 2 minutes.
Info
----
- Stack Overflow: https://stackoverflow.com/questions/58758480/xarray-select-nearest-lat-lon-with-multi-dimension-coordinates
- MetPy Details: https://unidata.github.io/MetPy/latest/tutorials/xarray_tutorial.html?highlight=assign_y_x
Parameters
----------
ds : a friendly xarray Dataset
points : tuple (lon, lat) or list of tuples
The longitude and latitude (lon, lat) coordinate pair (as a tuple)
for the points you want to pluck from the gridded Dataset.
A list of tuples may be given to return the values from multiple points.
names : list
A list of names for each point location (i.e., station name).
        If None, no names are assigned. ``names`` must be the same
        length as ``points``.
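
    Example (a sketch; assumes ``ds`` carries CF grid metadata that
    MetPy can parse):

    >>> pts = [(-111.97, 40.78), (-104.98, 39.74)]
    >>> ds_points = nearest_points(ds, pts, names=["KSLC", "KDEN"])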
"""
    log.warning(
        "Deprecated: `nearest_points` is now a Herbie accessor. Use `ds.herbie.nearest_points()`"
    )
# Check if MetPy has already parsed the CF metadata grid projection.
# Do that if it hasn't been done yet.
if "metpy_crs" not in ds:
ds = ds.metpy.parse_cf()
# Apply the MetPy method `assign_y_x` to the dataset
# https://unidata.github.io/MetPy/latest/api/generated/metpy.xarray.html?highlight=assign_y_x#metpy.xarray.MetPyDataArrayAccessor.assign_y_x
ds = ds.metpy.assign_y_x()
# Convert the requested [(lon,lat), (lon,lat)] points to map projection.
# Accept a list of point tuples, or Shapely Points object.
# We want to index the dataset at a single point.
# We can do this by transforming a lat/lon point to the grid location
crs = ds.metpy_crs.item().to_cartopy()
# lat/lon input must be a numpy array, not a list or polygon
if isinstance(points, tuple):
        # If a tuple is given, turn it into a one-item list.
points = np.array([points])
if not isinstance(points, np.ndarray):
# Points must be a 2D numpy array
points = np.array(points)
lons = points[:, 0]
lats = points[:, 1]
transformed_data = crs.transform_points(ccrs.PlateCarree(), lons, lats)
xs = transformed_data[:, 0]
ys = transformed_data[:, 1]
# Select the nearest points from the projection coordinates.
# TODO: Is there a better way?
# There doesn't seem to be a way to get just the points like this
# ds = ds.sel(x=xs, y=ys, method='nearest')
# because it gives a 2D array, and not a point-by-point index.
    # Instead, I have to loop the ds.sel method
new_ds = xr.concat(
[ds.sel(x=xi, y=yi, method="nearest") for xi, yi in zip(xs, ys)], dim="point"
)
# Add list of names as a coordinate
if names is not None:
# Assign the point dimension as the names.
assert len(points) == len(names), "`points` and `names` must be same length."
new_ds["point"] = names
return new_ds
# TODO: I like the idea in Salem to mask data by a geographic region
# TODO: Maybe can use that in Herbie. https://github.com/fmaussion/salem