Source code for herbie.archive

#!/usr/bin/env python3

## Brian Blaylock
## May 6, 2022

"""
===============================
Herbie: Retrieve NWP Model Data
===============================

Herbie is your model output download assistant with a mind of his own!
Herbie might look small on the outside, but he has a big heart on the
inside and will get you to the
`finish line <https://www.youtube.com/watch?v=4XWufUZ1mxQ&t=189s>`_.
Happy racing! 🏁

`📓 Documentation <https://blaylockbk.github.io/Herbie/_build/html/>`_

With Herbie's API, you can search and download GRIB2 model output files
from different archive sources for the High-Resolution Rapid Refresh
(HRRR), HRRR-Alaska, Rapid Refresh (RAP), Global Forecast System (GFS),
and others.

Herbie looks for GRIB2 model output data from NOMADS, NOAA's Big Data
Project partners (Amazon Web Services, Google Cloud Platform, and
Microsoft Azure), and the CHPC Pando archive at the University of Utah.

Herbie supports subsetting of GRIB2 files by individual GRIB
messages (i.e., variable and level) when the index (.idx) file exists
and helps you open them with xarray/cfgrib.

Herbie is extendable to support other models. Simply create a template
file in the ``herbie/models`` directory and make a pull-request.

For more details, see https://blaylockbk.github.io/Herbie/_build/html/user_guide/data_sources.html
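
Example usage (a minimal sketch based on the API defined in this module;
the date and searchString values are only illustrative)::

    from herbie.archive import Herbie

    H = Herbie("2022-01-01 06:00", model="hrrr", product="sfc", fxx=6)
    H.read_idx(":TMP:2 m above ground:")     # inspect the index file
    H.download(":TMP:2 m above ground:")     # download a GRIB2 subset
    ds = H.xarray(":TMP:2 m above ground:")  # open as an xarray Dataset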

.. note:: Updates since the ``Herbie 0.0.9`` release

    - TODO: Rename 'searchString' to 'subset' (and rename subset function)
    - TODO: add `idx_to_df()` and `df_to_idx()` methods.
    - TODO: clean up document examples. It's kind of scattered now.

"""
import functools
import hashlib
import itertools
import json
import logging
import os
import sys
import urllib.request
import warnings
from datetime import datetime, timedelta
from io import StringIO

import cfgrib
import pandas as pd
import pygrib
import requests
import xarray as xr
from pyproj import CRS

import herbie.models as model_templates
from herbie import Path, config
from herbie.help import _searchString_help
from herbie.misc import ANSI

# NOTE: The config dict values are retrieved from __init__ and read
# from the file ${HOME}/.config/herbie/config.toml
# Path is imported from __init__ because it has my custom methods.

try:
    # Load custom xarray accessors
    import herbie.accessors
except Exception:
    warnings.warn(
        "herbie xarray accessors could not be imported. "
        "Probably missing a dependency like MetPy. "
        "If you want to use these functions, try "
        "`pip install metpy`."
    )

log = logging.getLogger(__name__)


def wgrib2_idx_to_str(GRIB2_FILEPATH):
    """Produce the index file as a string with wgrib2"""
    import subprocess
    from shutil import which

    if which("wgrib2") is None:
        raise RuntimeError("wgrib2 command was not found.")

    cmd = f"wgrib2 -s {GRIB2_FILEPATH}"
    out = subprocess.run(cmd, shell=True, capture_output=True, check=True)
    return out.stdout.decode("utf-8")
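
# Example (a sketch; assumes wgrib2 is on your PATH and the GRIB2 file
# exists locally; the file name is only illustrative):
#
#     idx_text = wgrib2_idx_to_str("hrrr.t00z.wrfsfcf00.grib2")
#     print(idx_text.splitlines()[0])  # first inventory line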


class Herbie:
    """
    Locate a GRIB2 file at one of the archive sources.

    Parameters
    ----------
    date : pandas-parsable datetime
        *Model initialization datetime*.
        If None, then ``valid_date`` must be set.
    valid_date : pandas-parsable datetime
        Model valid datetime. Must be set when ``date`` is None.
    fxx : int
        Forecast lead time in hours. Available lead times depend on the
        model type and model version. The range is model and run dependent.
    model : {'hrrr', 'hrrrak', 'rap', 'gfs', 'gfs_wave', 'ecmwf', 'rrfs', etc.}
        Model name as defined in the models template folder.
        CASE INSENSITIVE. Some examples:

        - ``'hrrr'`` HRRR contiguous United States model
        - ``'hrrrak'`` HRRR Alaska model (alias ``'alaska'``)
        - ``'rap'`` RAP model
        - ``'ecmwf'`` ECMWF open data forecast products
    product : {'sfc', 'prs', 'nat', 'subh'}
        Output variable product file type. If not specified, the first
        product in the model template file is used. CASE SENSITIVE.
        For example, the HRRR model has these products:

        - ``'sfc'`` surface fields
        - ``'prs'`` pressure fields
        - ``'nat'`` native fields
        - ``'subh'`` subhourly fields
    member : None or int
        Some ensemble models (e.g., the future RRFS) will need to
        specify an ensemble member.
    priority : list or str
        List of model sources to get the data in the order of download
        priority. CASE INSENSITIVE. Some example data sources and the
        default priority order are listed below.

        - ``'aws'`` Amazon Web Services (Big Data Program)
        - ``'nomads'`` NOAA's NOMADS server
        - ``'google'`` Google Cloud Platform (Big Data Program)
        - ``'azure'`` Microsoft Azure (Big Data Program)
        - ``'pando'`` University of Utah Pando Archive (gateway 1)
        - ``'pando2'`` University of Utah Pando Archive (gateway 2)
    save_dir : str or pathlib.Path
        Location to save GRIB2 files locally. The default save directory
        is set in ``~/.config/herbie/config.toml``.
    overwrite : bool
        If True, look for GRIB2 files even if a local copy exists.
        If False (default), use the local copy (the idx file still needs
        to be found).
    **kwargs
        Any other parameter needed to satisfy the conditions in the model
        template file (e.g., ``nest=2``, ``other_label='run2'``).
    """

    def __init__(
        self,
        date=None,
        *,
        valid_date=None,
        model=config["default"].get("model"),
        fxx=config["default"].get("fxx"),
        product=config["default"].get("product"),
        priority=config["default"].get("priority"),
        save_dir=config["default"].get("save_dir"),
        overwrite=config["default"].get("overwrite", False),
        verbose=config["default"].get("verbose", True),
        **kwargs,
    ):
        """Specify the model output and find the GRIB2 file at one of the sources."""
        self.fxx = fxx

        if date:
            # User supplied `date`, which is the model initialization datetime.
            self.date = pd.to_datetime(date)
            self.valid_date = self.date + timedelta(hours=self.fxx)
        elif valid_date:
            # User supplied `valid_date`, which is the model valid datetime.
            self.valid_date = pd.to_datetime(valid_date)
            self.date = self.valid_date - timedelta(hours=self.fxx)
        else:
            raise ValueError("Must specify either `date` or `valid_date`.")

        self.model = model.lower()
        self.product = product

        self.priority = priority
        self.save_dir = Path(save_dir).expand()
        self.overwrite = overwrite
        self.verbose = verbose

        # Some model templates may require kwargs not listed (e.g., `nest=`, `member=`).
        for key, value in kwargs.items():
            # TODO: Check if the kwarg is a config default.
            # TODO: e.g., if a user primarily works with RRFS, they may
            # TODO: want to configure "member" as a default argument.
            # You may also set IDX_SUFFIX as an argument.
            setattr(self, key, value)

        # Get details from the template of the specified model.
        # This attaches the details from the `models.<model>.template`
        # class to this Herbie object.
        # This line is equivalent to `model_templates.gfs.template(self)`.
        # I do it this way because the model name is a variable.
        # (See https://stackoverflow.com/a/7936588/2383070 for what I'm doing here.)
        getattr(model_templates, self.model).template(self)

        if product is None:
            # The user didn't specify a product, so let's use the first
            # product in the model template.
            self.product = list(self.PRODUCTS)[0]
            log.info(f'`product` not specified. Will use "{self.product}".')
            # We need to rerun this so the sources have the new product value.
            getattr(model_templates, self.model).template(self)

        self.product_description = self.PRODUCTS[self.product]

        # Specify the suffix for the inventory index files.
        # The default value is `.grib2.idx`, but some files have a weird
        # suffix; for example, archived RAP on NCEI uses `.grb2.inv`.
        self.IDX_SUFFIX = getattr(self, "IDX_SUFFIX", [".grib2.idx"])

        # Specify the index file style. By default, Herbie assumes the
        # index file was created with wgrib2. ECMWF index files created
        # with eccodes are in a different style.
        self.IDX_STYLE = getattr(self, "IDX_STYLE", "wgrib2")

        self.searchString_help = _searchString_help(self.IDX_STYLE)

        # Check the user input.
        self._validate()

        # Ok, now we are ready to look for the GRIB2 file at each of the remote sources.
        # self.grib is the first existing GRIB2 file discovered.
        # self.idx is the first existing index file discovered.
        self.grib, self.grib_source = self.find_grib()
        self.idx, self.idx_source = self.find_idx()

        if verbose:
            # ANSI colors added for style points
            if any([self.grib is not None, self.idx is not None]):
                print(
                    "✅ Found",
                    f"┊ model={self.model}",
                    f"┊ {ANSI.italic}product={self.product}{ANSI.reset}",
                    f"┊ {ANSI.green}{self.date:%Y-%b-%d %H:%M UTC}{ANSI.bright_green} F{self.fxx:02d}{ANSI.reset}",
                    f"┊ {ANSI.orange}{ANSI.italic}GRIB2 @ {self.grib_source}{ANSI.reset}",
                    f"┊ {ANSI.orange}{ANSI.italic}IDX @ {self.idx_source}{ANSI.reset}",
                )
            else:
                print(
                    "💔 Did not find",
                    f"┊ model={self.model}",
                    f"┊ {ANSI.italic}product={self.product}{ANSI.reset}",
                    f"┊ {ANSI.green}{self.date:%Y-%b-%d %H:%M UTC}{ANSI.bright_green} F{self.fxx:02d}{ANSI.reset}",
                )
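
    # Example (a sketch; the dates below are only illustrative): these two
    # calls describe the same forecast file, because `date + fxx hours`
    # equals `valid_date`.
    #
    #     Herbie("2022-01-01 00:00", model="hrrr", fxx=6)
    #     Herbie(valid_date="2022-01-01 06:00", model="hrrr", fxx=6)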

    def __repr__(self):
        """Representation in Notebook."""
        msg = (
            f"{ANSI.herbie} {self.model.upper()} model",
            f"{ANSI.italic}{self.product}{ANSI.reset} product initialized",
            f"{ANSI.green}{self.date:%Y-%b-%d %H:%M UTC}{ANSI.bright_green} F{self.fxx:02d}{ANSI.reset}",
            f"┊ {ANSI.orange}{ANSI.italic}source={self.grib_source}{ANSI.reset}",
        )
        return " ".join(msg)

    def __str__(self):
        """Print a short summary when the Herbie object is printed."""
        # * Keep this simple so it runs fast.
        msg = (f"║HERBIE╠ {self.model.upper()}:{self.product}",)
        return " ".join(msg)

    def tell_me_everything(self):
        """Print all the attributes of the Herbie object."""
        msg = []
        for i in dir(self):
            if isinstance(getattr(self, i), (int, str, dict)):
                if not i.startswith("__"):
                    msg.append(f"self.{i}={getattr(self, i)}")
        msg = "\n".join(msg)
        print(msg)

    def __logo__(self):
        """For fun, show the Herbie logo."""
        print(ANSI.ascii)

    def _validate(self):
        """Validate the Herbie class input arguments."""
        # Accept a model alias
        if self.model.lower() == "alaska":
            self.model = "hrrrak"

        _models = {m for m in dir(model_templates) if not m.startswith("__")}
        _products = set(self.PRODUCTS)

        assert self.date < datetime.utcnow(), "🔮 `date` cannot be in the future."
        assert self.model in _models, f"`model` must be one of {_models}"
        assert self.product in _products, f"`product` must be one of {_products}"

        if isinstance(self.IDX_SUFFIX, str):
            self.IDX_SUFFIX = [self.IDX_SUFFIX]

        if isinstance(self.priority, str):
            self.priority = [self.priority]

        if self.priority is not None:
            self.priority = [i.lower() for i in self.priority]

            # Don't look for data from NOMADS if the requested date is
            # earlier than 14 days ago. NOMADS doesn't keep data that old
            # (I think this is true of all models).
            if "nomads" in self.priority:
                expired = datetime.utcnow() - timedelta(days=14)
                expired = pd.to_datetime(f"{expired:%Y-%m-%d}")
                if self.date < expired:
                    self.priority.remove("nomads")

    def _ping_pando(self):
        """Ping the Pando server before downloading to prevent a bad handshake."""
        try:
            requests.head("https://pando-rgw01.chpc.utah.edu/")
        except requests.exceptions.RequestException:
            print("🤝🏻⛔ Bad handshake with pando? Am I able to move on?")

    def _check_grib(self, url):
        """Check that the GRIB2 URL exists and is of useful length."""
        head = requests.head(url)
        check_exists = head.ok
        if check_exists:
            check_content = int(head.raw.info()["Content-Length"]) > 1_000_000
            return check_exists and check_content
        else:
            return False

    def _check_idx(self, url, verbose=False):
        """Check if an index file exists for the GRIB2 URL."""
        # To check inventory files with a slightly different URL
        # structure, we will loop through the IDX_SUFFIX options.
        if verbose:
            print(f"🐜 {self.IDX_SUFFIX=}")

        # Loop through the IDX_SUFFIX options until we find one that exists.
        for i in self.IDX_SUFFIX:
            if Path(url).suffix in {".grb", ".grib", ".grb2", ".grib2"}:
                idx_url = url.rsplit(".", maxsplit=1)[0] + i
            else:
                idx_url = url + i

            idx_exists = requests.head(idx_url).ok
            if verbose:
                print(f"🐜 {idx_url=}")
                print(f"🐜 {idx_exists=}")
            if idx_exists:
                return idx_exists, idx_url

        if verbose:
            print(
                "⚠ Herbie didn't find any inventory files that",
                f"exist from {self.IDX_SUFFIX}",
            )
        return False, None
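
    # A quick sketch of how `_check_idx` builds a candidate index URL with
    # the default IDX_SUFFIX (the file name below is only illustrative):
    #
    #     .../hrrr.t00z.wrfsfcf00.grib2          (GRIB2 URL)
    #     .../hrrr.t00z.wrfsfcf00.grib2.idx      (".grib2" stripped, ".grib2.idx" appended)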

    def find_grib(self, overwrite=False):
        """
        Find a GRIB file from the archive sources.

        Returns
        -------
        1) The URL or pathlib.Path to the GRIB2 file that exists
        2) The source of the GRIB2 file
        """
        # But first, check if the GRIB2 file exists locally.
        local_grib = self.get_localFilePath()
        if local_grib.exists() and not self.overwrite:
            return local_grib, "local"
            # NOTE: We will still get the idx files from a remote
            # because they aren't stored locally, or are they?
            # TODO: If the idx file is local, then use that.

        # If the priority list is set, search SOURCES in that priority
        # order. If priority is None, search all SOURCES in the order
        # given by the model template file.
        # NOTE: A source from the template will not be used if it is not
        # included in the priority list.
        if self.priority is not None:
            self.SOURCES = {
                key: self.SOURCES[key] for key in self.priority if key in self.SOURCES
            }

        # Ok, NOW we are ready to search for the remote GRIB2 files...
        for source in self.SOURCES:
            if "pando" in source:
                # Sometimes pando returns a bad handshake. Pinging
                # pando first can help prevent that.
                self._ping_pando()

            # Get the file URL for the source and determine if the
            # GRIB2 file exists. If found, return the URL for the
            # GRIB2 file and its source.
            grib_url = self.SOURCES[source]
            if self._check_grib(grib_url):
                return [grib_url, source]

        return [None, None]

    def find_idx(self):
        """Find an index file for the GRIB file."""
        # If the priority list is set, search SOURCES in that priority
        # order. If priority is None, search all SOURCES in the order
        # given by the model template file.
        # NOTE: A source from the template will not be used if it is not
        # included in the priority list.
        if self.priority is not None:
            self.SOURCES = {
                key: self.SOURCES[key] for key in self.priority if key in self.SOURCES
            }

        # Ok, NOW we are ready to search for the remote index files...
        for source in self.SOURCES:
            if "pando" in source:
                # Sometimes pando returns a bad handshake. Pinging
                # pando first can help prevent that.
                self._ping_pando()

            # Get the file URL for the source and determine if the
            # index file exists. If found, return the URL for the
            # .idx file and its source.
            grib_url = self.SOURCES[source]
            if source == "local":
                local_grib = Path(grib_url).expand()
                local_idx = local_grib.with_suffix(self.IDX_SUFFIX[0])
                return [local_idx, "local"]

            idx_exists, idx_url = self._check_idx(grib_url)
            if idx_exists:
                return [idx_url, source]

        return [None, None]

    @property
    def get_remoteFileName(self, source=None):
        """Predict the remote file name."""
        if source is None:
            source = list(self.SOURCES)[0]
        return self.SOURCES[source].split("/")[-1]

    @property
    def get_localFileName(self):
        """Predict the local file name of the full file."""
        return self.LOCALFILE

    def get_localFilePath(self, searchString=None):
        """Get the path to the local file."""
        if list(self.SOURCES)[0] == "local":
            # TODO: An experimental special case for locally stored GRIB2.
            outFile = Path(self.SOURCES["local"]).expand()
        else:
            outFile = (
                self.save_dir.expand()
                / self.model
                / f"{self.date:%Y%m%d}"
                / self.get_localFileName
            )

        if searchString is not None:
            # Reassign the index DataFrame with the requested searchString.
            idx_df = self.read_idx(searchString)

            # ======================================
            # Make a unique filename for the subset

            # Get a list of all GRIB message numbers. We will use this
            # in the output file name as a unique identifier.
            all_grib_msg = "-".join([f"{i:g}" for i in idx_df.index])

            # To prevent a "filename too long" error, create a hash that
            # represents the file name and subsetted variables to shorten
            # the name. I want the files to still be sorted by date, fxx,
            # and subset fields, so include three separate hashes so that
            # similar files will be sorted together.
            hash_date = hashlib.blake2b(
                f"{self.date:%Y%m%d%H%M}".encode(), digest_size=1
            ).hexdigest()
            hash_fxx = hashlib.blake2b(
                f"{self.fxx}".encode(), digest_size=1
            ).hexdigest()
            hash_label = hashlib.blake2b(
                all_grib_msg.encode(), digest_size=2
            ).hexdigest()

            # Prepend the filename with the hash label to distinguish it
            # from the full file. The hash label is a cryptic
            # representation of the GRIB messages in the subset.
            outFile = (
                outFile.parent
                / f"subset_{hash_date}{hash_fxx}{hash_label}__{outFile.name}"
            )

        return outFile
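
    # A sketch of the subset file naming (the full-file name below is only
    # illustrative). blake2b with digest_size=1 gives a 2-character hex
    # label and digest_size=2 gives a 4-character hex label, so a subset
    # name looks like:
    #
    #     subset_<2 hex: date><2 hex: fxx><4 hex: messages>__hrrr.t00z.wrfsfcf00.grib2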

    @functools.cached_property
    def index_as_dataframe(self):
        """Read and cache the full index file."""
        # If the index file does not exist on the archive, but we have
        # downloaded the full file (it is local), then we can use wgrib2
        # to create the index file.
        if self.idx is None:
            if self.grib_source == "local":
                # Use wgrib2 to get the index file if the file is local.
                log.info("🧙🏻‍♂️ I'll use wgrib2 to create the missing index file.")
                self.idx = StringIO(wgrib2_idx_to_str(self.get_localFilePath()))
                self.IDX_STYLE = "wgrib2"
            else:
                raise ValueError(
                    f"\nNo index file was found for {self.grib}.\n"
                    f"Download the full file first (with `H.download()`).\n"
                    f"You will need to remake the Herbie object (`H = Herbie()`)\n"
                    f"or delete this cached property: `del H.index_as_dataframe`."
                )

        assert self.idx is not None, f"No index file found for {self.grib}."

        if self.IDX_STYLE == "wgrib2":
            # Sometimes the idx lines end in ':', other times they don't
            # (in some Pando files).
            # https://pando-rgw01.chpc.utah.edu/hrrr/sfc/20180101/hrrr.t00z.wrfsfcf00.grib2.idx
            # https://noaa-hrrr-bdp-pds.s3.amazonaws.com/hrrr.20210101/conus/hrrr.t00z.wrfsfcf00.grib2.idx
            # Sometimes the idx has more than the standard messages.
            # https://noaa-nbm-grib2-pds.s3.amazonaws.com/blend.20210711/13/core/blend.t13z.core.f001.co.grib2.idx
            df = pd.read_csv(
                self.idx,
                sep=":",
                names=[
                    "grib_message",
                    "start_byte",
                    "reference_time",
                    "variable",
                    "level",
                    "forecast_time",
                    "?",
                    "??",
                    "???",
                ],
            )

            # Format the DataFrame
            df["reference_time"] = pd.to_datetime(
                df.reference_time, format="d=%Y%m%d%H"
            )
            df["valid_time"] = df["reference_time"] + pd.to_timedelta(f"{self.fxx}H")
            df["start_byte"] = df["start_byte"].astype(int)
            df["end_byte"] = df["start_byte"].shift(-1, fill_value="")
            # TODO: Check this works: Assign the ending byte for the last row...
            # TODO: df["end_byte"] = df["start_byte"].shift(-1, fill_value=requests.get(self.grib, stream=True).headers['Content-Length'])
            # TODO: Based on what Karl Schnieder did.
df["range"] = df.start_byte.astype(str) + "-" + df.end_byte.astype(str) df = df.reindex( columns=[ "grib_message", "start_byte", "end_byte", "range", "reference_time", "valid_time", "variable", "level", "forecast_time", "?", "??", "???", ] ) df = df.dropna(how="all", axis=1) df = df.fillna("") df["search_this"] = ( df.loc[:, "variable":] .astype(str) .apply( lambda x: ":" + ":".join(x).rstrip(":").replace(":nan:", ":"), axis=1, ) ) if self.IDX_STYLE == "eccodes": # eccodes keywords explained here: # https://confluence.ecmwf.int/display/UDOC/Identification+keywords r = requests.get(self.idx) idxs = [json.loads(x) for x in r.text.split("\n") if x] df = pd.DataFrame(idxs) # Format the DataFrame df.index = df.index.rename("grib_message") df.index += 1 df = df.reset_index() df["start_byte"] = df["_offset"] df["end_byte"] = df["_offset"] + df["_length"] df["range"] = df.start_byte.astype(str) + "-" + df.end_byte.astype(str) df["reference_time"] = pd.to_datetime( df.date + df.time, format="%Y%m%d%H%M" ) df["step"] = pd.to_timedelta(df.step.astype(int), unit="H") df["valid_time"] = df.reference_time + df.step df = df.reindex( columns=[ "grib_message", "start_byte", "end_byte", "range", "reference_time", "valid_time", "step", # ---- Used for searchString ------------------------------ "param", # parameter field (variable) "levelist", # level "levtype", # sfc=surface, pl=pressure level, pt=potential vorticity "number", # model number (used in ensemble products) "domain", # g=global "expver", # experiment version "class", # classification (od=routing operations, rd=research, ) "type", # fc=forecast, an=analysis, "stream", # oper=operationa, wave=wave, ef/enfo=ensemble, ] ) df["search_this"] = ( df.loc[:, "param":] .astype(str) .apply( lambda x: ":" + ":".join(x).rstrip(":").replace(":nan:", ":"), axis=1, ) ) # Attach some attributes df.attrs = dict( url=self.idx, source=self.idx_source, description="Inventory index file for the GRIB2 file.", model=self.model, product=self.product, lead_time=self.fxx, datetime=self.date, ) return df

    def read_idx(self, searchString=None):
        """
        Inspect the GRIB2 file contents by reading the index file.

        This reads index files created with the wgrib2 utility.

        Parameters
        ----------
        searchString : str
            Filter the DataFrame by a searchString regular expression.
            Searches for strings in the index file lines, specifically
            the variable, level, and forecast_time columns.
            Execute ``_searchString_help()`` for examples of a good
            searchString.

            .. include:: ../../user_guide/searchString.rst

        Returns
        -------
        A Pandas DataFrame of the index file.
        """
        df = self.index_as_dataframe

        # Filter the DataFrame by the searchString
        if searchString not in [None, ":"]:
            logic = df.search_this.str.contains(searchString)
            if logic.sum() == 0:
                print(
                    f"No GRIB messages found. There might be something wrong with {searchString=}"
                )
                print(_searchString_help(kind=self.IDX_STYLE))
            df = df.loc[logic]

        return df
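
    # Example (a sketch; the searchString value is only illustrative):
    # filter the inventory for 2-m temperature with a wgrib2-style pattern.
    #
    #     H = Herbie("2022-01-01", model="hrrr", product="sfc", fxx=0)
    #     H.read_idx(searchString=":TMP:2 m above ground:")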

    def download(
        self,
        searchString=None,
        *,
        source=None,
        save_dir=None,
        overwrite=None,
        verbose=None,
        errors="warn",
    ):
        """
        Download the file from its source.

        TODO: When we download a full file, the value of self.grib and
        TODO: self.grib_source should change to represent the local file.

        Subsetting by variable follows the same principles described here:
        https://www.cpc.ncep.noaa.gov/products/wesley/fast_downloading_grib.html

        Parameters
        ----------
        searchString : str
            If None, download the full file. Else, use regex to subset
            the file by specific variables and levels.

            .. include:: ../../user_guide/searchString.rst
        source : {'nomads', 'aws', 'google', 'azure', 'pando', 'pando2'}
            If None, download the GRIB2 file from ``self.grib``, which is
            the first location the GRIB2 file was found from the priority
            list when this class was initialized. Else, you may specify
            the source to force downloading it from a different location.
        save_dir : str or pathlib.Path
            Location to save the model output files.
            If None, uses the default or the path specified in __init__.
            Else, changes the path where files are saved.
        overwrite : bool
            If True, overwrite existing files. The default will skip
            downloading if the full file exists. Not applicable when
            searchString is not None because file subsets might be unique.
        errors : {'warn', 'raise'}
            When an error occurs, send a warning or raise a ValueError.
        """

        def _reporthook(a, b, c):
            """
            Print download progress in megabytes.

            Parameters
            ----------
            a : Chunk number
            b : Maximum chunk size
            c : Total size of the download
            """
            chunk_progress = a * b / c * 100
            total_size_MB = c / 1000000.0
            print(
                f"\r🚛💨 Download Progress: {chunk_progress:.2f}% of {total_size_MB:.1f} MB\r",
                end="",
            )

        def subset(searchString, outFile):
            """Download a subset specified by the regex searchString."""
            # TODO: An alternative to downloading a subset with curl is
            # TODO: to use the requests module directly:
            # TODO: >> headers = dict(Range=f"bytes={start_bytes}-{end_bytes}")
            # TODO: >> r = requests.get(grib_url, headers=headers)
            grib_source = self.grib
            if hasattr(grib_source, "as_posix") and grib_source.exists():
                # The GRIB source is local. Curl the local file.
                # See https://stackoverflow.com/a/21023161/2383070
                grib_source = f"file://{str(self.grib)}"
            if verbose:
                print(
                    f'📇 Download subset: {self.__repr__()}{" ":60s}\n cURL from {grib_source}'
                )

            # Download subsets of the file by byte range with cURL.
            # > Instead of using a single curl command for each row,
            # > group adjacent messages in the same curl command.

            # Find the index groupings.
            # TODO: Improve this for readability.
            # https://stackoverflow.com/a/32199363/2383070
            idx_df = self.read_idx(searchString)
            li = idx_df.index
            inds = (
                [0]
                + [ind for ind, (i, j) in enumerate(zip(li, li[1:]), 1) if j - i > 1]
                + [len(li) + 1]
            )
            curl_groups = [li[i:j] for i, j in zip(inds, inds[1:])]

            curl_ranges = []
            group_dfs = []
            for i, group in enumerate(curl_groups):
                _df = idx_df.loc[group]
                curl_ranges.append(f"{_df.iloc[0].start_byte}-{_df.iloc[-1].end_byte}")
                group_dfs.append(_df)

            for i, (range, _df) in enumerate(zip(curl_ranges, group_dfs)):
                if verbose:
                    # NOTE: Use `_` here so the group counter `i` is not clobbered.
                    for _, row in _df.iterrows():
                        print(
                            f"  {row.grib_message:<3g} {ANSI.orange}{row.search_this}{ANSI.reset}"
                        )

                if i == 0:
                    # If we are working on the first item, overwrite the existing file...
                    curl = f"curl -s --range {range} {grib_source} > {outFile}"
                else:
                    # ...all other messages are appended to the subset file.
curl = f"curl -s --range {range} {grib_source} >> {outFile}" os.system(curl) if verbose: print(f"💾 Saved the subset to {outFile}") # If the file exists in the localPath and we don't want to # overwrite, then we don't need to download it. outFile = self.get_localFilePath(searchString=searchString) # This overrides the overwrite specified in __init__ if overwrite is not None: self.overwrite = overwrite # This overrides the verbose specified in __init__ if verbose is not None: self.verbose = verbose if outFile.exists() and not self.overwrite: if verbose: print(f"🌉 Already have local copy --> {outFile}") return outFile if self.overwrite and self.grib_source == "local": # Search for the grib files on the remote archives again self.grib, self.grib_source = self.find_grib(overwrite=True) self.idx, self.idx_source = self.find_idx() print(f"Overwrite local file with file from [{self.grib_source}]") # This overrides the save_dir specified in __init__ if save_dir is not None: self.save_dir = Path(save_dir).expand() if not hasattr(Path(self.save_dir).expand(), "exists"): self.save_dir = Path(self.save_dir).expand() # Check that data exists if self.grib is None: msg = f"🦨 GRIB2 file not found: {self.model=} {self.date=} {self.fxx=}" if errors == "warn": log.warning(msg) return # Can't download anything without a GRIB file URL. elif errors == "raise": raise ValueError(msg) if self.idx is None and searchString is not None: msg = f"🦨 Index file not found; cannot download subset: {self.model=} {self.date=} {self.fxx=}" if errors == "warn": log.warning( msg + " I will download the full file because I cannot subset." ) elif errors == "raise": raise ValueError(msg) if source is not None: # Force download from a specified source and not from first in priority self.grib = self.SOURCES[source] # Create directory if it doesn't exist if not outFile.parent.is_dir(): outFile.parent.mkdir(parents=True, exist_ok=True) print(f"👨🏻‍🏭 Created directory: [{outFile.parent}]") if searchString in [None, ":"] or self.idx is None: # Download the full file from remote source urllib.request.urlretrieve(self.grib, outFile, _reporthook) self.grib = outFile # self.grib_source = "local" if verbose: print( f"✅ Success! Downloaded {self.model.upper()} from \033[38;5;202m{self.grib_source:20s}\033[0m\n\tsrc: {self.grib}\n\tdst: {outFile}" ) else: # Download a subset of the file subset(searchString, outFile) return outFile

    def xarray(
        self,
        searchString=None,
        backend_kwargs={},
        remove_grib=True,
        **download_kwargs,
    ):
        """
        Open the GRIB2 data as an xarray Dataset.

        Parameters
        ----------
        searchString : str
            Variables to read into the xarray Dataset.
        remove_grib : bool
            If True, the GRIB file will be removed ONLY IF it didn't
            exist before we downloaded it.
        """
        download_kwargs = {**dict(overwrite=False), **download_kwargs}

        # Download the file if a local file does not exist.
        local_file = self.get_localFilePath(searchString=searchString)

        # ! \/ This is critical...
        # Only remove the file if it did not exist before we downloaded it.
        remove_grib = not local_file.exists() and remove_grib

        # ! \/ Fail-safe; never remove a file if the source is 'local'.
        if self.grib_source == "local":
            remove_grib = False

        if not local_file.exists() or download_kwargs["overwrite"]:
            self.download(searchString=searchString, **download_kwargs)

        # Backend kwargs for cfgrib
        backend_kwargs.setdefault("indexpath", "")
        backend_kwargs.setdefault(
            "read_keys", ["parameterName", "parameterUnits", "stepRange"]
        )
        backend_kwargs.setdefault("errors", "raise")

        # Use cfgrib.open_datasets, just in case there are multiple
        # "hypercubes" for what we requested.
        Hxr = cfgrib.open_datasets(
            local_file,
            backend_kwargs=backend_kwargs,
        )

        # Get the CF grid projection information with pygrib and pyproj
        # because this is something cfgrib doesn't do
        # (https://github.com/ecmwf/cfgrib/issues/251).
        # NOTE: Assumes the projection is the same for all variables.
        grib = pygrib.open(str(local_file))
        msg = grib.message(1)
        cf_params = CRS(msg.projparams).to_cf()

        # Funny stuff with polar stereographic
        # (https://github.com/pyproj4/pyproj/issues/856).
        # TODO: Is there a better way to handle this? What about the south pole?
        if cf_params["grid_mapping_name"] == "polar_stereographic":
            cf_params["latitude_of_projection_origin"] = cf_params.get(
                "latitude_of_projection_origin", 90
            )

        # Here I'm looping over each Dataset in the list returned by cfgrib.
        for ds in Hxr:
            # Add some details
            # ----------------
            ds.attrs["model"] = self.model
            ds.attrs["product"] = self.product
            ds.attrs["description"] = self.DESCRIPTION
            ds.attrs["remote_grib"] = self.grib
            ds.attrs["local_grib"] = local_file
            ds.attrs["searchString"] = searchString

            # Attach CF grid mapping
            # ----------------------
            # http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#appendix-grid-mappings
            ds["gribfile_projection"] = None
            ds["gribfile_projection"].attrs = cf_params
            ds["gribfile_projection"].attrs[
                "long_name"
            ] = f"{self.model.upper()} model grid projection"

            # Assign this grid_mapping for all variables.
            for var in list(ds):
                if var == "gribfile_projection":
                    continue
                ds[var].attrs["grid_mapping"] = "gribfile_projection"

        # ! DO NOT REMOVE GRIB FILES IF THE SOURCE IS LOCAL
        # ! (I know I already checked this; I am just so worried about erasing my local data.)
        if self.grib_source != "local":
            if remove_grib:
                # Load the datasets into memory before removing the file.
                Hxr = [ds.load() for ds in Hxr]
                _ = [ds.close() for ds in Hxr]

                # TODO: Forcefully close the files so the GRIB file can be
                # TODO: removed (this is a Windows-specific requirement).
                # os.close(?WHAT IS THE FILE HANDLER?)
                """
                https://docs.python.org/3/library/os.html#os.remove
                On Windows, attempting to remove a file that is in use
                causes an exception to be raised; on Unix, the directory
                entry is removed but the storage allocated to the file is
                not made available until the original file is no longer
                in use.
                >> HOW DO I COMPLETELY CLOSE THE FILE OPENED BY CFGRIB??
                """
""" if not sys.platform == "win32": # Removes file local_file.unlink() else: warnings.warn("sorry, on windows I couldn't remove the file.") if len(Hxr) == 1: return Hxr[0] else: # cfgrib returned multiple hypercubes. try: # Handle case where HRRR subh returns multiple hypercubes (see #73) data_vars = set(itertools.chain(*[list(i) for i in Hxr])) data_vars.remove("gribfile_projection") Hxr = xr.concat(Hxr, dim="step", data_vars=data_vars) except: print( f"Note: Returning a list of [{len(Hxr)}] xarray.Datasets because cfgrib opened with multiple hypercubes." ) return Hxr