Source code for mfobs.obs

"""
General functions for creating observation input for PEST, 
including base observations representing field measurements and simulated equivalents, 
and derivative observations that are created from the base observations.
"""
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
from mfobs.checks import check_obsnme_suffix
from mfobs.fileio import write_insfile, get_insfile_observations
from mfobs.sep import ih_method
from mfobs.utils import fill_nats, set_period_start_end_dates


[docs] def get_base_obs(perioddata, model_output, observed_values_file, variable_name=None, simulated_values_column='sim_obsval', observed_values_metadata_file=None, observed_values_site_id_col='obsprefix', observed_values_datetime_col='datetime', obsnme_date_suffix=True, obsnme_suffix_format='%Y%m', observed_values_obsval_col='obsval', observed_values_group_column='obgnme', observed_values_unc_column='uncertainty', aggregrate_observed_values_method='mean', drop_groups=None, label_period_as_steady_state=None, steady_state_period_start=None, steady_state_period_end=None, forecast_sites=None, forecast_start_date=None, forecast_end_date=None, forecasts_only=False, forecast_sites_only=False, outfile=None, verbose=True, write_ins=False): """Get a set of base observations from a tables of model output, observed values, and model time discretizaton. Parameters ---------- perioddata : str, pathlike or DataFrame Path to csv file or pandas DataFrame with start/end dates for stress periods or timesteps. Must have a `start_datetime` and either a `time` column (of elapsed times at the end of each period, in days), or an `end_datetime` column of period end dates. =================== ======================================================= start_datetime start date for each stress period or timestep end_datetime (optional) end date for each stress period or timestep time (optional) elapsed simulation time at the end of each period, in days =================== ======================================================= model_output : DataFrame DataFrame with one head observation per row, with the following columns: ======================= ============================================================= site_no unique identifier for each site variable variable name for the simulated values obsprefix prefix of observation name (<site_no>-<variable>) simulated_values_column column with simulated values datetime pandas datetimes, based on stress period end date layer zero-based model layer obsnme observation name based on format of <obsprefix>_'%Y%m' ======================= ============================================================= Usually, a helper function such as :func:`mfobs.modflow.get_mf6_single_variable_obs` or other custom code will be used to produce model_output from the raw model output files. Observation prefixes in the `obsprefix` column are assumed to be formated as **<site_no>-<variable>**, where `site_no` is a unique identifier for a site contained in `observed_values_file` and `variable` matches the `variable_name` argument to this function (and is the variable name for the observed values in `observed_values_file`). All observation prefixes are assumed to be case-insensitive (obsprefixes in the model_output and obseved_values_files tables are all converted to lower case prior to matching). observed_values_file : str, pathlike or DataFrame Path to csv file or pandas DataFrame with observed values. Must have the following columns (column names in brackets <> are supplied with other arguments to this function): ============================== ================================================================================ <observed_values_obsval_col> Column with observed values <observed_values_site_id_col> Column with unique identifier for each site <observed_values_datetime_col> Column with date/time for each observation <observed_values_unc_column> Column with estimated uncertainty values for each observation ============================== ================================================================================ variable_name : str, optional Name for the type of values being processed, e.g. 'head', 'downstream-flow' or 'stage'. Required if the `obsprefix` values in the `model_output` table are formatted as **<site number>-<variable>**; if supplied, the observation names created for the observations in `observed_values_file` will also have **<site number>-<variable>** prefixes, and the observed values and simulated equivalents can be aligned. By default, None, in which case the `obsprefix` values in the `model_output` table are assumed to be the site numbers; prefixes for observations in `observed_values_file` will also then be site numbers. simulated_values_column : str, optional Column in model output with simulated values observed_values_metadata_file : str, pathlike or DataFrame, optional Table with site information for the observed values. observed_values_obsval_col : str, optional Column in obs_values_file with measured values observed_values_site_id_col : str Column with unique identifier for each site, by default, 'obsprefix' observed_values_datetime_col : str Column with date/time for each observation, by default, 'datetime' observed_values_unc_column : str Column with estimated uncertainty values for each observation, by default, 'uncertainty' aggregrate_observed_values_method : str Method for aggregating observed values to time periods bracked by aggregate_start_dates and aggregate_end_dates. Can be any method used to aggregate values on a pandas groupby object (e.g. 'mean', 'last', etc.) obsnme_date_suffix : bool If true, give observations a date-based suffix. Otherwise, assign a stress period- or timestep-based suffix. In either case, the format of the suffix is controlled by obsnme_suffix_format. by default True obsnme_suffix_format : str, optional Format for suffix of obsnmes. Observation names are created following the format of <obsprefix>_<date or stress period suffix>. By default, ``'%Y%m'``, which would yield ``'202001'`` for a Jan, 2020 observation (obsnme_date_suffix=True). If obsnme_date_suffix=False, obsnme_suffix_format should be a decimal format in the "new-style" string format (e.g. '{:03d}', which would yield ``'001'`` for stress period 1.) label_period_as_steady_state : int, optional Zero-based model stress period where observations will be assigned the suffix 'ss' instead of a date suffix. By default, None, in which case all model output is assigned a date suffix based on the start date of the stress period. Passed to :func:`~mfobs.modflow.get_mf6_single_variable_obs`. steady_state_period_start : str, optional Start date for the period representing steady-state conditions. Observations between ``steady_state_period_start`` and ``steady_state_period_end`` will be averaged to create additional observations with the suffix 'ss'. The steady-state averages will be matched to model output from the stress period specified by ``label_period_as_steady_state``. By default None, in which case all observations are used (if ``observed_values_datetime_col is None``) or no steady-state observations are created (``observed_values_datetime_col`` exists). steady_state_period_end : str, optional End date for the period representing steady-state conditions. By default None, in which case all observations are used (if ``observed_values_datetime_col is None``) or no steady-state observations are created (``observed_values_datetime_col`` exists). forecast_sites : str or sequence, optional At these sites, observations will be created for each simulated value, regardless is there is an observed equivalent. Can be supplied as a sequence of site numbers (`site_id`s) or ``'all'`` to include all sites. By default, None (no forecast sites). forecast_start_date : str, optional Start date for forecast period. When forecast_sites is not ``None``, forecast observations will be generated for each time between `forecast_start_date` and `forecast_end_date`. By default, None (generate forecasts for any time with missing values). forecast_end_date : str, optional End date for forecast period. When forecast_sites is not ``None``, forecast observations will be generated for each time between `forecast_start_date` and `forecast_end_date`. By default, None (generate forecasts for any time with missing values). forecasts_only : bool, optional Use this option to only output forecast observations (those without an observed equivalent), subject to the parameters of `forecast_sites`, `forecast_start_date`, and `forecast_end_date`. forecast_sites_only : bool, optional Option to only output observations at sites specified with `forecast_sites` (has no effect if ``forecast_sites='all'``). If ``forecasts_only=False``, the output will include forecast and non-forecast observations (for a continuous time-series). verbose : bool Controls the verbosity of screen outputs. If True, warnings will be printed when there are no observations within a stress period and forecast_sites is None, (meaning no observation results will be processed for that stress period), or when supplied observations fall outside of the model timeframe. Within a workflow, verbose can be set equal to write_ins for initial debugging, and the subsequently set False during history matching runs. by default, True outfile : str, optional [description], by default 'processed_flux_obs.dat' write_ins : bool, optional [description], by default False Returns ------- base_obs : DataFrame Table of base observations with the following columns: =================== ============================================================= datetime pandas datetimes, based on stress period end date per MODFLOW stress period site_no unique site identifier obsprefix prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix> obsval observed values sim_obsval simulated equivalents to observed values obgnme observation group name =================== ============================================================= """ # validation checks check_obsnme_suffix(obsnme_date_suffix, obsnme_suffix_format, function_name='get_base_obs') outpath = Path('.') if outfile is not None: outpath = Path(outfile).parent #obs_values_column = 'obsval' output_sim_values_column = 'sim_obsval' output_obs_values_column = 'obsval' #if variable_name is None: # obs_values_column = 'obsval' #'obs_value' # #sim_values_column = 'sim_value' #else: # obs_values_column = 'obs_' + variable_name # output column with observed values # #sim_values_column = 'sim_' + variable_name # output column with simulated equivalents to observed values # read in/set up the perioddata table if not isinstance(perioddata, pd.DataFrame): perioddata = pd.read_csv(perioddata) else: perioddata = perioddata.copy() set_period_start_end_dates(perioddata) #perioddata.index = perioddata.per # model results results = model_output.copy() results['obsprefix'] = results['obsprefix'].str.lower() # rename columns to their defaults renames = {observed_values_site_id_col: 'site_no', observed_values_datetime_col: 'datetime', observed_values_group_column: 'obgnme', observed_values_unc_column: 'uncertainty' } # read in/set up the observed values table if not isinstance(observed_values_file, pd.DataFrame): observed = pd.read_csv(observed_values_file, dtype={observed_values_site_id_col: object}) else: observed = observed_values_file observed.rename(columns=renames, inplace=True) if 'obsprefix' not in observed.columns: if variable_name is not None: observed['obsprefix'] = [f"{sn}-{variable_name}" for sn in observed['site_no']] if forecast_sites is not None and forecast_sites != 'all': forecast_sites = [f"{sn}-{variable_name}" for sn in forecast_sites] else: observed['obsprefix'] = observed['site_no'].astype(str) else: observed['obsprefix'] = observed['obsprefix'].astype(str) # read in the observed values metadata if observed_values_metadata_file is not None: if not isinstance(observed_values_metadata_file, pd.DataFrame): metadata = pd.read_csv(observed_values_metadata_file, dtype={observed_values_site_id_col: object}) else: metadata = observed_values_metadata_file metadata.rename(columns=renames, inplace=True) if 'obsprefix' not in metadata.columns: metadata['obsprefix'] = metadata[observed_values_site_id_col] # join the metadata to the observed data metadata.index = metadata['obsprefix'].values observed.index = observed['obsprefix'].values join_cols = [c for c in ['site_no', 'screen_top', 'screen_botm', 'x', 'y', 'layer'] if c in metadata.columns] observed = observed.join(metadata[join_cols]) # convert obs names and prefixes to lower case observed['obsprefix'] = observed['obsprefix'].str.lower() # make a dictionary of site metadata for possible use later temp = observed.copy() temp.index = temp['obsprefix'].str.lower() site_info_dict = temp.to_dict() for col in 'datetime', 'obsval', observed_values_obsval_col: if col in site_info_dict: del site_info_dict[col] del temp # cast datetimes to pandas datetimes # (observed data may not have a datetime col. if model is steady-state) if observed_values_datetime_col is not None and 'datetime' in observed.columns: try: observed['datetime'] = pd.to_datetime(observed['datetime']) except ValueError: observed['datetime'] = pd.to_datetime(observed['datetime'], format='mixed') # not necessarily True observed['steady'] = False # flag for steady-state observations elif label_period_as_steady_state is not None: observed['datetime'] = pd.to_datetime( perioddata['start_datetime'][label_period_as_steady_state]) observed['steady'] = True else: if len(perioddata) == 1: observed['datetime'] = pd.to_datetime(perioddata['start_datetime'][0]) observed['steady'] = True else: raise ValueError("Model has more than one stress period " "but observed data have no 'datetime' column.") # drop model results that aren't in the obs information file # these are probably observations that aren't in the model time period # (and therefore weren't included in the parent model calibration; # but modflow-setup would include them in the MODFLOW observation input) # also drop sites that are in the obs information file, but not in the model results # these include sites outside of the model (i.e. in the inset when looking at the parent) no_info_sites = set(results.obsprefix).symmetric_difference(observed.obsprefix) if forecast_sites == 'all': # forecast observations at all simulated sites # (only drop sites that aren't simulated) no_info_sites = set(observed.obsprefix).difference(results.obsprefix) elif forecast_sites is not None: # remove selected forecast sites from 'no_info' sites to drop forecast_sites = {s.lower() for s in forecast_sites} no_info_sites = no_info_sites.difference(forecast_sites) # dump these out to a csv print('Dropping {} sites with no information'.format(len(no_info_sites))) dropped_obs_outfile = outpath / 'dropped_head_observation_sites.csv' results.loc[results.obsprefix.isin(no_info_sites)].to_csv(dropped_obs_outfile, index=False) results = results.loc[~results.obsprefix.isin(no_info_sites)].copy() observed = observed.loc[~observed.obsprefix.isin(no_info_sites)].copy() if len(results) == 0: raise ValueError("No matches between observation prefixes in the model results" "and observed values! Check that the obsprefixes in the " "observed_values_file match those in the model_output DataFrame.") # for each model stress period or timestep, get the simulated values # and the observed equivalents observed.index = pd.to_datetime(observed.datetime) observed.sort_index(inplace=True) results.index = pd.to_datetime(results.datetime) results.sort_index(inplace=True) # integer column for stress period- or timestep-based obsnme suffixes # timestep-based observations if 'timestep' in perioddata.columns: perioddata['unique_timestep'] = list(range(len(perioddata))) per_column = 'unique_timestep' # stress period-based observations else: per_column = 'per' observed_simulated_combined = [] # if no datetime column is supplied with observations, # only make steady state obs if observed_values_datetime_col is None and \ label_period_as_steady_state is not None: idx = label_period_as_steady_state perioddata = perioddata[idx:idx+1] for i, r in perioddata.iterrows(): # get the equivalent observed values #start, end = perioddata.loc[per, ['start_datetime', 'end_datetime']] start, end = r['start_datetime'], r['end_datetime'] # date-based suffix if obsnme_date_suffix: suffix = pd.Timestamp(end).strftime(obsnme_suffix_format) # stress or timestep-based period-based suffix else: suffix = f"{r[per_column]:{obsnme_suffix_format.strip('{:}')}}" # steady-state observations can represent a period # other than the "modflow time" in the perioddata table if r['per'] == label_period_as_steady_state: suffix = 'ss' if steady_state_period_start is not None: start = steady_state_period_start if steady_state_period_end is not None: end = steady_state_period_end # don't process observations for a steady-state period unless # it is explicitly labeled as such and given a representative date range elif r['steady']: continue observed_in_period_rs = aggregrate_to_period( observed, start, end, aggregrate_observed_values_method=aggregrate_observed_values_method, obsnme_suffix=suffix) if verbose and (forecast_sites is None) and (len(observed_in_period_rs) == 0): if per_column == 'per': warnings.warn(('Stress period {}: No observations between start and ' 'end dates of {} and {}!'.format(r['per'], start, end))) continue # get the simulated equivalents # first populate obsnmes sim_in_period_rs = aggregrate_to_period( results, start, end, aggregrate_observed_values_method=aggregrate_observed_values_method, obsnme_suffix=suffix) if verbose and (sim_in_period_rs is None) or (len(sim_in_period_rs) == 0): if per_column == 'per': warnings.warn(('Stress period {}: No simulated equivalents between start and ' 'end dates of {} and {}!'.format(r['per'], start, end))) continue if forecast_sites is not None: observed_in_period_rs = observed_in_period_rs.reindex(sim_in_period_rs['obsnme'].values, axis=0) obsprefix = observed_in_period_rs.index.str.split('_', expand=True).levels[0] observed_in_period_rs['obsprefix'] = obsprefix observed_in_period_rs['obsnme'] = observed_in_period_rs.index observed_in_period_rs['datetime'] = sim_in_period_rs['datetime'].values[0] any_simulated_obs = sim_in_period_rs.obsnme.isin(observed_in_period_rs.obsnme).any() if verbose and not any_simulated_obs and (per_column == 'per'): warnings.warn(('Stress period {}: No observation/simulated equivalent pairs for start and ' 'end dates of {} and {}!'.format(r['per'], start, end))) continue observed_in_period_rs[output_sim_values_column] = sim_in_period_rs.reindex(observed_in_period_rs.index)[simulated_values_column] # add stress period and observed values observed_in_period_rs['per'] = r['per'] if per_column == 'unique_timestep': observed_in_period_rs['timestep'] = r['timestep'] observed_in_period_rs['unique_timestep'] = r['unique_timestep'] observed_in_period_rs[output_obs_values_column] = observed_in_period_rs[observed_values_obsval_col] observed_simulated_combined.append(observed_in_period_rs) # Combined DataFrame of observed heads and simulated equivalents if len(observed_simulated_combined) == 0: msg = ("No overlap between model results " f"({results['datetime'].min():%Y-%m-%d} to {results['datetime'].max():%Y-%m-%d})" " and observed values " f"({observed['datetime'].min():%Y-%m-%d} to {observed['datetime'].max():%Y-%m-%d})!" ) raise ValueError(msg) obsdata = pd.concat(observed_simulated_combined) # raise an error if there are duplicates- reindexing below will fail if this is the case if obsdata.index.duplicated().any(): msg = ('The following observations have duplicate names. There should only be' 'one observation per site, for each time period implied by the ' 'obsnme_date_suffix_format parameter.\n{}' .format(obsdata.loc[obsdata.duplicated()])) raise ValueError(msg) # drop any observations in specified groups # (e.g. lake stages that should be compared with lake package output) if drop_groups is not None and 'obgnme' in obsdata.columns: obsdata = obsdata.loc[~obsdata.obgnme.isin(drop_groups)].copy() if 'obgnme' not in obsdata.columns: obsdata['obgnme'] = variable_name # fill forecast obs with site info from observed dataframe if forecast_sites is not None: for k, v in site_info_dict.items(): obsdata[k] = [v.get(p, current) for p, current in zip(obsdata['obsprefix'], obsdata[k])] obsdata['obsnme'] = obsdata.index else: # nans are where sites don't have observation values for that period # or sites that are in other model (inset or parent) obsdata.dropna(subset=[output_obs_values_column], axis=0, inplace=True) # label forecasts in own group if forecast_sites is not None: is_forecast = obsdata[output_obs_values_column].isna() obsdata.loc[is_forecast, 'obgnme'] += '-forecast' # cull forecasts to specified date window # and specific sites (if specified) keep_forecasts = is_forecast.copy() #np.array([True] * len(head_obs)) if forecast_start_date is not None: keep_forecasts = (obsdata['datetime'] >= forecast_start_date) if forecast_end_date is not None: keep_forecasts &= (obsdata['datetime'] <= forecast_end_date) #drop = drop & is_forecast #head_obs = head_obs.loc[~drop].copy() #is_forecast = head_obs[output_obs_values_column].isna() if forecast_sites != 'all': keep_forecasts &= obsdata['obsprefix'].isin(forecast_sites) # option to only include forecast obs # (those without observed equivalents) if forecasts_only: keep = keep_forecasts else: keep = keep_forecasts | ~is_forecast # option to only include output from designated forecast sites if forecast_sites_only: keep = keep & obsdata['obsprefix'].isin(forecast_sites) obsdata = obsdata.loc[keep].copy() # add standard obsval and obgmne columns columns = ['datetime', 'per', 'site_no', 'obsprefix', 'obsnme', output_obs_values_column, output_sim_values_column, 'uncertainty', 'obgnme'] if output_obs_values_column != 'obsval': obsdata['obsval'] = obsdata[output_obs_values_column] columns.insert(-1, 'obsval') # reorder the columns columns = [c for c in columns if c in obsdata.columns] base_obs = obsdata[columns].copy() if 'layer' in columns: base_obs['layer'] = base_obs['layer'].astype(int) # fill NaT (not a time) datetimes fill_nats(base_obs, perioddata) base_obs.sort_values(by=['obsprefix', 'per'], inplace=True) if outfile is not None: base_obs.fillna(-1e30).to_csv(outfile, sep=' ', index=False) print(f'wrote {len(base_obs):,} observations to {outfile}') # write the instruction file if write_ins: write_insfile(base_obs, str(outfile) + '.ins', obsnme_column='obsnme', simulated_obsval_column=output_sim_values_column, index=False) return base_obs
[docs] def aggregrate_to_period(data, start, end, obsnme_suffix, aggregrate_observed_values_method='mean', ): data_in_period = data.loc[start:end].reset_index(drop=True) if (len(data_in_period) == 0): cols = list(data.columns) + ['obsnme'] data_in_period_rs = pd.DataFrame(columns=cols) data_in_period_rs.index = data_in_period_rs['obsnme'] return data_in_period_rs data_in_period.sort_values(by=['obsprefix', 'datetime'], inplace=True) if 'n' not in data_in_period.columns: data_in_period['n'] = 1 by_site = data_in_period.groupby('obsprefix') data_in_period_rs = by_site.first() data_cols = [c for c, dtype in data_in_period.dtypes.items() if 'float' in dtype.name] for c in data_cols: data_in_period_rs[c] = getattr(by_site[c], aggregrate_observed_values_method)() data_in_period_rs['n'] = by_site.n.sum() data_in_period_rs['datetime'] = pd.Timestamp(end) data_in_period_rs.reset_index(inplace=True) # put obsprefix back missing_cols = set(data_in_period.columns).difference(data_in_period_rs.columns) for col in missing_cols: data_in_period_rs[col] = by_site[col].first().values data_in_period_rs = data_in_period_rs[data_in_period.columns] obsnames = ['{}_{}'.format(prefix.lower(), obsnme_suffix) for prefix in data_in_period_rs.obsprefix] data_in_period_rs['obsnme'] = obsnames data_in_period_rs.index = data_in_period_rs['obsnme'] return data_in_period_rs
[docs] def get_spatial_differences(base_data, perioddata, difference_sites, obs_values_col='obs_head', sim_values_col='sim_head', use_gradients=False, sep='-d-', write_ins=False, outfile=None): r"""Takes the base_data dataframe output by :func:`mfobs.obs.get_obs` and creates spatial difference observations. Optionally writes an output csvfile and a PEST instruction file. Parameters ---------- base_data : DataFrame Table of preprocessed observations, such as that produced by :func:`mfobs.obs.get_base_obs`. Must have the following columns: =================== ======================================================================== datetime pandas datetimes, based on stress period end date per MODFLOW stress period site_no unique site identifier obsprefix\ :sup:`1` prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix> obsval observed values sim_obsval simulated equivalents to observed values obgnme observation group name screen_top (optional) well open interval top; required if ``use_gradients=True`` screen_botm (optional) well open interval bottom; required if ``use_gradients=True`` =================== ======================================================================== 1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no> perioddata : DataFrame DataFrame with start/end dates for stress periods. Must have columns 'time' (modflow time, in days), 'start_datetime' (start date for the stress period) and 'end_datetime' (end date for the stress period). difference_sites : dict Dictionary of site numbers (keys) and other site numbers to compare to (values). Values can be a string for a single site, a list of strings for multiple sites, or a string pattern contained in multiple site numbers; observations at the sites represented in the values will be compared to the observation at the site represented by the key, at times of coincident measurements. Differences are computed by subtracting the values site(s) from the key site, so for example, to represent a gain in streamflow as positive, the downstream site should be key site. obs_values_col : str Column in ``base_data`` with observed values sim_values_col : str Column in `base_data`` with simulated equivalent values use_gradients : bool If True, compute vertical hydraulic gradients and use those for the observation values, if False, use differences. For this option, 'screen_top' and 'screen_botm' columns are needed in ``base_data``. By default False. sep : str Separator in spatial difference obsnnames. For example, with sites "site1" and "site2" at time "202001", and sep='-d-', the obsnme would be "site1-d-site2_202001". by default, '-d-' outfile : str, optional CSV file to write output to. Nan values are filled with -1e30. By default, None (no output written) write_ins : bool, optional Option to write instruction file, by default False Returns ======= spatial_differences : DataFrame Spatial difference observations. Columns: ================= =================================================================================== datetime observation date-time labels per model stress period obsprefix observation name prefix (site identifier) obsnme1 name of observation from keys of ``difference_sites`` <obs_values_col>1 observed value associated with obsnme1 <sim_values_col>1 simulated equivalent associated with obsnme1 screen_top1 well screen top (elevation) associated with obsnme1* screen_botm1 well screen botm (elevation) associated with obsnme1* layer1 model layer associated with obsnme1* obsnme2 name of observation from value(s) in ``difference_sites`` (associated with obsnme1) <obs_values_col>2 observed value associated with obsnme2 <sim_values_col>2 simulated equivalent associated with obsnme2 screen_top2 well screen top (elevation) associated with obsnme2* screen_botm2 well screen botm (elevation) associated with obsnme2* layer2 model layer associated with obsnme2* obs_diff observed difference between obsnme1 and obsnme2 sim_diff simulated equivalent difference between obsnme1 and obsnme2 dz distance between well screen midpoints for obsnme1 and obsnme2* obs_grad observed vertical hydraulic gradient between obsnme1 and obsnme2* sim_grad simulated equivalent vertical hydraulic gradient between obsnme1 and obsnme2* obgnme observation group obsnme spatial difference observation name obsval observation value (i.e. for PEST control file) sim_obsval simulated equivalent (i.e. for PEST instruction file) type description of spatial difference observations uncertainty (loosely) error-based uncertainty, assumed to be 2x that of obsnme2 ================= =================================================================================== Notes: * * denotes optional columns that may not be present. * Columns relating to well open interval are only created if ``base_data`` has 'screen_top' and 'screen_botm' columns. * Negative difference or gradient values indicate a gradient towards the key site. """ # copy the base data to prevent unintended side-effects base_data = base_data.copy() # model stress period data: perioddata = perioddata.copy() # make sure start and end dates don't overlap set_period_start_end_dates(perioddata) perioddata.index = perioddata.per # rename the observed and sim. eq. values columns #if variable is not None: # renames = {obs_values_col: f'obs_{variable}', # sim_values_col: f'sim_{variable}', # } # obs_values_col = f'obs_{variable}' # sim_values_col = f'sim_{variable}' #else: renames = {obs_values_col: 'base_obsval', # f'obs_{variable}', sim_values_col: 'base_sim_obsval', # f'sim_{variable}', } obs_values_col = f'base_obsval' sim_values_col = f'base_sim_obsval' base_data.drop([#f'obs_{variable}', f'sim_{variable}', 'base_obsval', 'bas_sim_obsval'], axis=1, inplace=True, errors='ignore') base_data.rename(columns=renames, inplace=True) # rename the observed and sim. eq. values columns #renames = {obs_values_col: f'obs_{variable}', # sim_values_col: f'sim_{variable}', # } #base_data.drop([f'obs_{variable}', f'sim_{variable}'], axis=1, # inplace=True, errors='ignore') #base_data.rename(columns=renames, inplace=True) #obs_values_col = f'obs_{variable}' #sim_values_col = f'sim_{variable}' # get a set of unique variables # (including '', which signifies no variable in the obsprefix, # just a site number) variables = {s.split('-')[1] if len(s.split('-')) > 1 else '' for s in base_data['obsprefix']} # get subset of base_data sites to compare to each key site in difference_sites base_data_obsprefixes = set(base_data['obsprefix']) groups = base_data.groupby('obsprefix') spatial_differences = [] # key site: value/comparison sites can be include variables or not # if they don't, and the base_data includes variables, # process each obsprefix with that site number for variable in variables: for key_obsprefix, patterns in difference_sites.items(): if len(variable) > 0: key_obsprefix = f"{key_obsprefix.split('-')[0]}-{variable}" if key_obsprefix not in base_data_obsprefixes: print((f'warning: observation prefix {key_obsprefix} not in base_data. ' 'Skipping spatial differencing.')) continue compare = [] if isinstance(patterns, str): patterns = [patterns] # matching obsprefixes must have the pattern and the variable # if obsprefixes are simply formatted as site numbers # (no variables) # then the only variable will be an empty string, # which will match any string for pattern in patterns: matches = [True if pattern in obsprefix and variable in obsprefix else False for obsprefix in base_data.obsprefix] compare.append(matches) compare = np.any(compare, axis=0) prefixes = set(base_data.loc[compare, 'obsprefix']) # for each site in the subset, compare the values to the keys site # index by stress period key_values = groups.get_group(key_obsprefix).copy() key_values.index = key_values.per for obsprefix, site_observations in groups: if obsprefix in prefixes: site_obs = site_observations.copy() site_obs.rename(columns={obs_values_col: f"{obs_values_col}2", # 'obs_head2', sim_values_col: f"{sim_values_col}2", # 'sim_head2', 'obsnme': 'obsnme2', 'screen_top': 'screen_top2', 'screen_botm': 'screen_botm2', 'layer': 'layer2' }, inplace=True) site_obs.index = site_obs.per site_obs['obsnme1'] = key_values['obsnme'] site_obs[f"{obs_values_col}1"] = key_values[obs_values_col] site_obs[f"{sim_values_col}1"] = key_values[sim_values_col] if 'screen_top' in key_values.columns: site_obs['screen_top1'] = key_values['screen_top'] if use_gradients and 'screen_botm' not in key_values.columns: raise ValueError("use_gradients=True requires both screen_top " "and screen_botm columns in base_data") if 'screen_botm' in key_values.columns: site_obs['screen_botm1'] = key_values['screen_botm'] if 'layer2' in site_obs.columns: site_obs['layer1'] = key_values['layer'] # negative values indicate gradient towards key site # (key site head < values site head) site_obs['obs_diff'] = site_obs[f"{obs_values_col}1"] - site_obs[f"{obs_values_col}2"] site_obs['sim_diff'] = site_obs[f"{sim_values_col}1"] - site_obs[f"{sim_values_col}2"] # get a screen midpoint and add gradient screen_midpoint1 = None if use_gradients and {'screen_top1', 'screen_botm1'}.intersection(site_obs.columns): screen_midpoint1 = site_obs[['screen_top1', 'screen_botm1']].mean(axis=1) if use_gradients and {'screen_top2', 'screen_botm2'}.intersection(site_obs.columns): screen_midpoint2 = site_obs[['screen_top2', 'screen_botm2']].mean(axis=1) if screen_midpoint1 is not None: site_obs['dz'] = (screen_midpoint1 - screen_midpoint2) site_obs['obs_grad'] = site_obs['obs_diff'] / site_obs['dz'] site_obs['sim_grad'] = site_obs['sim_diff'] / site_obs['dz'] # default obgnme prefix if 'obgnme' not in site_obs.columns: if len(variable) == 0: site_obs['obgnme'] = 'sdiff' else: site_obs['obgnme'] = variable # obgnme = <prefix>_sdiff # unless there is no prefix site_obs['obgnme'] = [f'{g}_sdiff' if g != 'sdiff' else g for g in site_obs['obgnme']] # whether to use gradients for the obsvals, or just head differences if use_gradients: site_obs['obsval'] = site_obs['obs_grad'] site_obs['sim_obsval'] = site_obs['sim_grad'] if len(variable) == 0: site_obs['type'] = f'gradient' else: site_obs['type'] = f'{variable} gradient' else: site_obs['obsval'] = site_obs['obs_diff'] site_obs['sim_obsval'] = site_obs['sim_diff'] if len(variable) == 0: site_obs['type'] = f'spatial difference' else: site_obs['type'] = f'spatial {variable} difference' spatial_differences.append(site_obs) if len(spatial_differences) == 0: raise ValueError('No spatial difference site/variable pairs found! ' 'Check that the key sites and their compare patterns exist in the base_data.') spatial_differences = pd.concat(spatial_differences) spatial_differences.dropna(subset=['obs_diff', 'sim_diff'], axis=0, inplace=True) # name the spatial head difference obs as # <obsprefix1><sep><obsprefix2>_<suffix> obsnme = [] obsprefix = [] for i, r in spatial_differences.iterrows(): prefix1, suffix1 = r.obsnme1.split('_') prefix2, suffix2 = r.obsnme2.split('_') assert suffix1 == suffix2, "Observations are at different times! {}, {}".format(r.obsnme1, r.obsnme2) # if the obsprefixes include variables; only retain the the second instance prefix = f"{prefix1.split('-')[0]}{sep}{prefix2}" obsnme.append('{}_{}'.format(prefix, suffix2)) obsprefix.append(prefix) spatial_differences['obsnme'] = obsnme spatial_differences['obsprefix'] = obsprefix # clean up columns cols = ['datetime', 'per', 'obsprefix', 'obsnme1', f"{obs_values_col}1", f"{sim_values_col}1", 'screen_top1', 'screen_botm1', 'layer1', 'obsnme2', f"{obs_values_col}2", f"{sim_values_col}2", 'screen_top2', 'screen_botm2', 'layer2', 'obs_diff', 'sim_diff', 'dz', 'obs_grad', 'sim_grad', 'obsnme', 'obsval', 'sim_obsval', 'obgnme', 'type' ] cols = [c for c in cols if c in spatial_differences.columns] spatial_differences = spatial_differences[cols] spatial_differences.dropna(axis=0, subset=['obsval'], inplace=True) # uncertainty column is from base_data; # assume that spatial head differences have double the uncertainty # (two wells/two measurements per obs) if 'uncertainty' in spatial_differences.columns: spatial_differences['uncertainty'] *= 2 # check for duplicates assert not spatial_differences['obsnme'].duplicated().any() # fill NaT (not a time) datetimes fill_nats(spatial_differences, perioddata) if outfile is not None: spatial_differences.fillna(-1e30).to_csv(outfile, sep=' ', index=False) print(f'wrote {len(spatial_differences):,} observations to {outfile}') # write the instruction file if write_ins: write_insfile(spatial_differences, str(outfile) + '.ins', obsnme_column='obsnme', simulated_obsval_column='sim_obsval', index=False) return spatial_differences
[docs] def get_temporal_differences(base_data, perioddata, obs_values_col='obs_head', sim_values_col='sim_head', variable='head', get_displacements=False, displacement_from=None, obsnme_date_suffix=True, obsnme_suffix_format='%Y%m', exclude_suffix='ss', exclude_obs=None, outfile=None, write_ins=False): """Takes the base_data dataframe output by :func:`mfobs.obs.get_obs`, creates temporal difference observations. Optionally writes an output csvfile and a PEST instruction file. Parameters ---------- base_data : DataFrame Head observation data with same column structure as output from :func:`mfobs.obs.get_obs` perioddata : DataFrame DataFrame with start/end dates for stress periods. Must have columns 'time' (modflow time, in days), 'start_datetime' (start date for the stress period) and 'end_datetime' (end date for the stress period). obs_values_col : str Column in ``base_data`` with observed values sim_values_col : str Column in `base_data`` with simulated equivalent values variable : str {'head', 'flux', or other} Type of observation being processed. Simulated and observed values columns are named in the format 'sim_<variable>' and 'obs_<variable>', respectively. If there is no 'obgnme' column in ``base_data``, ``variable`` is also used as a default base group name. get_displacements : bool If True, compute the displacement of each observation from a datum (specified by ``displacement_from``). If False, difference each observation with the previous observation. by default, False displacement_from : str or date-like Datum for computing displacements. Must be in a format that can be used for time slicing in pandas (e.g. '2010-01-01', which would result in displacements from the first observation on or after '2010-01-01' at each site, or None, which would result in displacements from the first observation at each site. By default, None non-zero weighted observation obsnme_date_suffix : bool If true, give observations a date-based suffix. Otherwise, assign a stress period-based suffix. In either case, the format of the suffix is controlled by obsnme_suffix_format. by default True obsnme_suffix_format : str, optional Format for suffix of obsnmes. Observation names are created following the format of <obsprefix>_<date or stress period suffix>. By default, ``'%Y%m'``, which would yield ``'202001'`` for a Jan, 2020 observation (obsnme_date_suffix=True). If obsnme_date_suffix=False, obsnme_suffix_format should be a decimal format in the "new-style" string format (e.g. '{:03d}', which would yield ``'001'`` for stress period 1.) exclude_suffix : str or list-like Option to exclude observations from processing by suffix; e.g. 'ss' to include steady-state observations. By default, 'ss' exclude_obs : list-like Sequence of observation names to exclude from return/written dataset. For example, if sequential head differences are also being computed, the first displacement observation after the reference observation will be a duplicate of the first sequential head difference observation. By default, None (no observations excluded). outfile : str, optional CSV file to write output to. By default, None (no output written) write_ins : bool, optional Option to write instruction file, by default False Returns ------- period_diffs : DataFrame Notes ----- Differences are computed by subtracting the previous time from the current, so a positive value indicates an increase. """ base_data = base_data.copy() base_data['datetime'] = pd.to_datetime(base_data['datetime']) # validation checks check_obsnme_suffix(obsnme_date_suffix, obsnme_suffix_format, function_name='get_head_obs', obsdata=base_data) if base_data.columns.duplicated().any(): raise ValueError('Duplicate column names in base_data:\n' f'{base_data.columns[base_data.columns.duplicated(keep=False)]}') # rename the observed and sim. eq. values columns if variable is not None: renames = {obs_values_col: f'obs_{variable}', sim_values_col: f'sim_{variable}', } obs_values_col = f'obs_{variable}' sim_values_col = f'sim_{variable}' else: renames = {obs_values_col: 'base_obsval', # f'obs_{variable}', sim_values_col: 'base_sim_obsval', # f'sim_{variable}', } obs_values_col = f'base_obsval' sim_values_col = f'base_sim_obsval' base_data.drop([f'obs_{variable}', f'sim_{variable}', 'base_obsval', 'bas_sim_obsval'], axis=1, inplace=True, errors='ignore') base_data.rename(columns=renames, inplace=True) # rename the observed and sim. eq. values columns #renames = {obs_values_col: f'obs_{variable}', # sim_values_col: f'sim_{variable}', # } #base_data.drop([f'obs_{variable}', f'sim_{variable}'], axis=1, # inplace=True, errors='ignore') #base_data.rename(columns=renames, inplace=True) #obs_values_col = f'obs_{variable}' #sim_values_col = f'sim_{variable}' # only compute differences on transient obs if isinstance(exclude_suffix, str): exclude_suffix = [exclude_suffix] suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme] keep = ~np.isin(suffix, exclude_suffix) base_data = base_data.loc[keep].copy() # group observations by site (prefix) sites = base_data.groupby('obsprefix') period_diffs = [] # get index position for obsval columns, # after adding them if needed # (this allows assignment of new values purely by iloc, # which avoids setting on slices in loop below) if 'obsval' in base_data.columns: obsval_col_iloc = base_data.columns.get_loc('obsval') else: base_data['obsval'] = -1e30 obsval_col_iloc = len(base_data.columns) - 1 if 'sim_obsval' in base_data.columns: sim_obsval_col_iloc = base_data.columns.get_loc('sim_obsval') else: base_data['sim_obsval'] = -1e30 sim_obsval_col_iloc = len(base_data.columns) - 1 for site_no, values in sites: values = values.sort_values(by=['per']).copy() values.index = values['datetime'] # compute the differences if get_displacements: values = values.sort_index().loc[displacement_from:].copy() # some sites may not have any measurements # after displacement datum; skip these if len(values) <= 1: continue values['obsval'] = values[obs_values_col] - \ values[obs_values_col].iloc[0] values['sim_obsval'] = values[sim_values_col] - \ values[sim_values_col].iloc[0] # assign np.nan to starting displacements (of 0) # (so they get dropped later on, # consistent with workflow for sequential difference obs) values.iloc[0, obsval_col_iloc] = np.nan values.iloc[0, sim_obsval_col_iloc] = np.nan #values['obsval'].iloc[0] = np.nan #values['sim_obsval'].iloc[0] = np.nan else: values['obsval'] = values[obs_values_col].diff() values['sim_obsval'] = values[sim_values_col].diff() # name the temporal difference obs as # <obsprefix>_<obsname1 suffix>d<obsname2 suffix> # where the obsval = obsname2 - obsname1 obsnme = [] for i, (idx, r) in enumerate(values.iterrows()): obsname2_suffix = '' if i > 0: if get_displacements: obsname_2_loc = 0 else: obsname_2_loc = i - 1 # date-based suffixes if obsnme_date_suffix: obsname2_suffix = values.iloc[obsname_2_loc] \ ['datetime'].strftime(obsnme_suffix_format) # stress period-based suffixes else: per = values.iloc[obsname_2_loc]['per'] obsname2_suffix = f"{per:{obsnme_suffix_format.strip('{:}')}}" obsnme.append('{}d{}'.format(r.obsnme, obsname2_suffix)) values['obsnme'] = obsnme # todo: is there a general uncertainty approach for temporal differences that makes sense? period_diffs.append(values) period_diffs = pd.concat(period_diffs).reset_index(drop=True) # some checks to make sure the assignments from the above loop are correct assert period_diffs.iloc[:, sim_obsval_col_iloc].name == 'sim_obsval' assert period_diffs.iloc[:, obsval_col_iloc].name == 'obsval' assert not np.any(period_diffs[['obsval', 'sim_obsval']] == -1e30) period_diffs['datetime'] = pd.to_datetime(period_diffs['datetime']) if 'obgnme' not in period_diffs.columns: period_diffs['obgnme'] = variable if get_displacements: period_diffs['type'] = f'{variable} displacement' period_diffs['obgnme'] = [f'{g}_disp' for g in period_diffs['obgnme']] else: period_diffs['type'] = f'temporal {variable} difference' period_diffs['obgnme'] = [f'{g}_tdiff' for g in period_diffs['obgnme']] # drop some columns that aren't really valid; if they exist period_diffs.drop(['n'], axis=1, inplace=True, errors='ignore') # clean up columns cols = ['datetime', 'per', 'obsprefix', 'obsnme', f'obs_{variable}', f'sim_{variable}', 'screen_top', 'screen_botm', 'layer', 'obsval', 'sim_obsval', 'obgnme', 'type'] cols = [c for c in cols if c in period_diffs.columns] period_diffs = period_diffs[cols].copy() # drop observations with no difference (first observations at each site) period_diffs.dropna(axis=0, subset=['sim_obsval'], inplace=True) # drop any excluded obs if exclude_obs is not None: exclude_obs = set(exclude_obs) print(f"dropping {len(exclude_obs)} observations specified with exclude_obs") period_diffs = period_diffs.loc[~period_diffs['obsnme'].isin(exclude_obs)].copy() # fill NaT (not a time) datetimes fill_nats(period_diffs, perioddata) if outfile is not None: period_diffs.fillna(-1e30).to_csv(outfile, sep=' ', index=False) print(f'wrote {len(period_diffs):,} observations to {outfile}') # write the instruction file if write_ins: write_insfile(period_diffs, str(outfile) + '.ins', obsnme_column='obsnme', simulated_obsval_column='sim_obsval', index=False) return period_diffs
[docs] def get_annual_means(base_data, obsnme_suffix_format='%Y', exclude_suffix='ss', obgnme_suffix='annual-mean', outfile=None, write_ins=False): r"""Create observations of annual mean values for a set of base observations. Means are computed for all columns with floating point data. Parameters ---------- base_data : DataFrame Table of preprocessed observations, such as that produced by :func:`mfobs.obs.get_base_obs`. Must have the following columns, in addition to columns of floating point data to aggregate (which can have any name): =================== ============================================================= datetime pandas datetimes, based on stress period end date site_no unique site identifier obsprefix\ :sup:`1` prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix>\ :sup:`2` obgnme observation group name =================== ============================================================= 1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no> 2) Assumed to be formatted <obsprefix>_<`obsnme_suffix_format`> obsnme_suffix_format : str, optional Format for suffix of obsnmes. By default '%Y', which returns the 4-digit year exclude_suffix : str or list-like Option to exclude observations from processing by suffix; e.g. 'ss' to include steady-state observations. By default, 'ss' obgnme_suffix : str Create new observation group names by appending this suffix to existing obgnmes, for example <existing obgnme>_<obgnme_suffix> by default, 'annual-mean' outfile : str, optional CSV file to write output to. By default, None (no output written) write_ins : bool, optional Option to write instruction file, by default False Returns ------- aggregated : DataFrame With the following columns (in addition to the data columns in `base_data`, which now contain the aggregated values): =================== ============================================================= datetime year of annual mean, as datetime year year of annual mean, as integer site_no unique site identifier obsprefix\ :sup:`1` prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix>\ :sup:`2` obgnme observation group name\ :sup:`3` =================== ============================================================= 1) With format <site_no>-<variable> or simply <site_no> 2) With suffix formatted with `obsnme_suffix_format` 3) With format of <original obgnme>_<obgnme_suffix> e.g. heads_annual-mean Notes ----- """ # validation checks #obsnme_date_suffix=True #check_obsnme_suffix(obsnme_date_suffix, obsnme_suffix_format, # function_name='get_head_obs', obsdata=base_data) base_data['datetime'] = pd.to_datetime(base_data['datetime']) # only compute statistics on transient obs if isinstance(exclude_suffix, str): exclude_suffix = [exclude_suffix] suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme] keep = ~np.isin(suffix, exclude_suffix) base_data = base_data.loc[keep].copy() # only include times where there are both an observation and sim. equivalent base_data.dropna(subset=['obsval', 'sim_obsval'], inplace=True) grouped = base_data.groupby([base_data['site_no'], base_data['datetime'].dt.year]) aggregated = grouped.first() data_cols = [c for c, dtype in base_data.dtypes.items() if 'float' in dtype.name] for c in data_cols: aggregated[c] = grouped[c].mean() aggregated['n'] = grouped.count().iloc[:, 0] aggregated.index.set_names(['site_no', 'year'], inplace=True) aggregated.reset_index(inplace=True) aggregated.sort_values(by=['site_no', 'year'], inplace=True) aggregated['obsnme'] = [f"{prefix}_{dt:{obsnme_suffix_format}}" for prefix, dt in zip(aggregated['obsprefix'], aggregated['datetime'])] aggregated['obgnme'] = [f"{prefix}_{obgnme_suffix}" for prefix in aggregated['obgnme']] cols = ['datetime', 'year', 'site_no', 'obsprefix', 'obsnme'] + data_cols + ['obgnme', 'n'] aggregated = aggregated[cols] if outfile is not None: aggregated.fillna(-1e30).to_csv(outfile, sep=' ', index=False) print(f'wrote {len(aggregated):,} observations to {outfile}') # write the instruction file if write_ins: write_insfile(aggregated, str(outfile) + '.ins', obsnme_column='obsnme', simulated_obsval_column='sim_obsval', index=False) return aggregated
[docs] def get_monthly_means(base_data, obsnme_suffix_format='%Y%m', exclude_suffix='ss', obgnme_suffix='monthly-mean', outfile=None, write_ins=False): r"""Create observations of monthly mean values for a set of base observations. Means are computed for all columns with floating point data. Parameters ---------- base_data : DataFrame Table of preprocessed observations, such as that produced by :func:`mfobs.obs.get_base_obs`. Must have the following columns, in addition to columns of floating point data to aggregate (which can have any name): =================== ============================================================= datetime pandas datetimes, based on stress period end date site_no unique site identifier obsprefix\ :sup:`1` prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix>\ :sup:`2` obgnme observation group name =================== ============================================================= 1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no> 2) Assumed to be formatted <obsprefix>_<`obsnme_suffix_format`> obsnme_suffix_format : str, optional Format for suffix of obsnmes. By default '%Y%m', which returns the year and month as consecutive integers, e.g. 200101 for Jan, 2001. exclude_suffix : str or list-like Option to exclude observations from processing by suffix; e.g. 'ss' to include steady-state observations. By default, 'ss' obgnme_suffix : str Create new observation group names by appending this suffix to existing obgnmes, for example <existing obgnme>_<obgnme_suffix> by default, 'annual-mean' outfile : str, optional CSV file to write output to. By default, None (no output written) write_ins : bool, optional Option to write instruction file, by default False Returns ------- aggregated : DataFrame With the following columns (in addition to the data columns in `base_data`, which now contain the aggregated values): =================== ============================================================= datetime aggregated dates as datetime objects site_no unique site identifier obsprefix\ :sup:`1` prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix>\ :sup:`2` obgnme observation group name\ :sup:`3` =================== ============================================================= 1) With format <site_no>-<variable> or simply <site_no> 2) With suffix formatted with `obsnme_suffix_format` 3) With format of <original obgnme>_<obgnme_suffix> e.g. heads_annual-mean Notes ----- """ # validation checks #obsnme_date_suffix=True #check_obsnme_suffix(obsnme_date_suffix, obsnme_suffix_format, # function_name='get_head_obs', obsdata=base_data) base_data['datetime'] = pd.to_datetime(base_data['datetime']) # only compute statistics on transient obs if isinstance(exclude_suffix, str): exclude_suffix = [exclude_suffix] suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme] keep = ~np.isin(suffix, exclude_suffix) base_data = base_data.loc[keep].copy() # only include times where there are both an observation and sim. equivalent base_data.dropna(subset=['obsval', 'sim_obsval'], inplace=True) grouped = base_data.groupby([base_data['site_no'], base_data['datetime'].dt.year, base_data['datetime'].dt.month]) aggregated = grouped.first() data_cols = [c for c, dtype in base_data.dtypes.items() if 'float' in dtype.name] for c in data_cols: aggregated[c] = grouped[c].mean() aggregated['n'] = grouped.count().iloc[:, 0] aggregated.index.set_names(['site_no', 'year', 'month'], inplace=True) aggregated.reset_index(inplace=True) aggregated.sort_values(by=['site_no', 'year', 'month'], inplace=True) aggregated['obsnme'] = [f"{prefix}_{dt:{obsnme_suffix_format}}" for prefix, dt in zip(aggregated['obsprefix'], aggregated['datetime'])] aggregated['obgnme'] = [f"{prefix}_{obgnme_suffix}" for prefix in aggregated['obgnme']] cols = ['datetime', 'site_no', 'obsprefix', 'obsnme'] + data_cols + ['obgnme', 'n'] aggregated = aggregated[cols] if outfile is not None: aggregated.fillna(-1e30).to_csv(outfile, sep=' ', index=False) print(f'wrote {len(aggregated):,} observations to {outfile}') # write the instruction file if write_ins: write_insfile(aggregated, str(outfile) + '.ins', obsnme_column='obsnme', simulated_obsval_column='sim_obsval', index=False) return aggregated
[docs] def get_mean_monthly(base_data, obsnme_suffix_format='%b', exclude_suffix='ss', obgnme_suffix='mean-monthly', outfile=None, write_ins=False): r"""Create observations of mean monthly, or means for months of the year (for example, the mean for Jan. across all years). Means are computed for all columns with floating point data. Parameters ---------- base_data : DataFrame Table of preprocessed observations, such as that produced by :func:`mfobs.obs.get_base_obs`. Must have the following columns, in addition to columns of floating point data to aggregate (which can have any name): =================== ============================================================= datetime pandas datetimes, based on stress period end date site_no unique site identifier obsprefix\ :sup:`1` prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix>\ :sup:`2` obgnme observation group name =================== ============================================================= 1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no> 2) Assumed to be formatted <obsprefix>_<`obsnme_suffix_format`> obsnme_suffix_format : str, optional Format for suffix of obsnmes. By default '%b', which returns the abbreviated month name (e.g. 'Jan'). exclude_suffix : str or list-like Option to exclude observations from processing by suffix; e.g. 'ss' to include steady-state observations. By default, 'ss' obgnme_suffix : str Create new observation group names by appending this suffix to existing obgnmes, for example <existing obgnme>_<obgnme_suffix> by default, 'annual-mean' outfile : str, optional CSV file to write output to. By default, None (no output written) write_ins : bool, optional Option to write instruction file, by default False Returns ------- aggregated : DataFrame With the following columns (in addition to the data columns in `base_data`, which now contain the aggregated values): =================== ============================================================= datetime datetimes that represent the mean of the period averaged month month of monthly average values site_no unique site identifier obsprefix\ :sup:`1` prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix>\ :sup:`2` obgnme observation group name\ :sup:`3` =================== ============================================================= 1) With format <site_no>-<variable> or simply <site_no> 2) With suffix formatted with `obsnme_suffix_format` 3) With format of <original obgnme>_<obgnme_suffix> e.g. heads_annual-mean Notes ----- """ # validation checks #obsnme_date_suffix=True, #check_obsnme_suffix(obsnme_date_suffix, obsnme_suffix_format, # function_name='get_head_obs', obsdata=base_data) base_data['datetime'] = pd.to_datetime(base_data['datetime']) # only compute statistics on transient obs if isinstance(exclude_suffix, str): exclude_suffix = [exclude_suffix] suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme] keep = ~np.isin(suffix, exclude_suffix) base_data = base_data.loc[keep].copy() # only include times where there are both an observation and sim. equivalent base_data.dropna(subset=['obsval', 'sim_obsval'], inplace=True) grouped = base_data.groupby([base_data['site_no'], base_data['datetime'].dt.month]) aggregated = grouped.first() data_cols = [c for c, dtype in base_data.dtypes.items() if 'float' in dtype.name] for c in data_cols: aggregated[c] = grouped[c].mean() aggregated['n'] = grouped.count().iloc[:, 0] aggregated.index.set_names(['site_no', 'month'], inplace=True) aggregated.reset_index(inplace=True) aggregated.sort_values(by=['site_no', 'month'], inplace=True) # change month column to be name instead of number aggregated['month'] = [f"{dt:%B}" for dt in aggregated['datetime']] aggregated['obsnme'] = [f"{prefix}_{dt:{obsnme_suffix_format}}".lower() for prefix, dt in zip(aggregated['obsprefix'], aggregated['datetime'])] aggregated['obgnme'] = [f"{prefix}_{obgnme_suffix}" for prefix in aggregated['obgnme']] cols = ['datetime', 'month', 'site_no', 'obsprefix', 'obsnme'] + data_cols + ['obgnme', 'n'] aggregated = aggregated[cols] if outfile is not None: aggregated.fillna(-1e30).to_csv(outfile, sep=' ', index=False) print(f'wrote {len(aggregated):,} observations to {outfile}') # write the instruction file if write_ins: write_insfile(aggregated, str(outfile) + '.ins', obsnme_column='obsnme', simulated_obsval_column='sim_obsval', index=False) return aggregated
[docs] def get_log10_observations(base_data, obsnme_prefix_variable='log10', obsnme_suffix_format='%Y%m%d', exclude_suffix='ss', obgnme_suffix='log10', fill_zeros_with=0, outfile=None, write_ins=False): r"""Create observations of log values. Log values are computed for all columns with floating point data. Parameters ---------- base_data : DataFrame Table of preprocessed observations, such as that produced by :func:`mfobs.obs.get_base_obs`. Must have the following columns, in addition to columns of floating point data to aggregate (which can have any name): =================== ============================================================= datetime pandas datetimes, based on stress period end date site_no unique site identifier obsprefix\ :sup:`1` prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix>\ :sup:`2` obgnme observation group name =================== ============================================================= 1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no> 2) Assumed to be formatted <obsprefix>_<`obsnme_suffix_format`> obsnme_prefix_variable : str Variable name indicating the type of observation. Is appended to the site number (site_no) to create new observation name prefixes (obsprefixes) in the format of <site_no>-<variable>. For example, by default, an observation prefix in base_data of '04026300-flow' or '04026300' (both for a site_no of '04026300'), would result in a new observation prefix of '04026300-flow-log10'. A log10 flow computed from an original flow observation named '04026300-flow_20221103' or '04026300_20221103' (at site '04026300' on 2022/11/03) would be named '04026300-flow-log10_20221103'. by default, 'log10'. obsnme_suffix_format : str, optional Format for suffix of obsnmes. By default ''%Y%m%d' (e.g. 20010101 for Jan 1, 2001) exclude_suffix : str or list-like Option to exclude observations from processing by suffix; e.g. 'ss' to include steady-state observations. By default, 'ss' obgnme_suffix : str Create new observation group names by appending this suffix to existing obgnmes, for example <existing obgnme>_<obgnme_suffix> by default, 'annual-mean' fill_zeros_with : numeric Fill value in log data for zero values in base_data. outfile : str, optional CSV file to write output to. By default, None (no output written) write_ins : bool, optional Option to write instruction file, by default False Returns ------- log_base_data : DataFrame With the following columns (in addition to the data columns in `base_data`, which now contain the aggregated values): =================== ============================================================= datetime observation dates as datetimes site_no unique site identifier obsprefix\ :sup:`1` prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix>\ :sup:`2` obgnme observation group name\ :sup:`3` =================== ============================================================= 1) With format <site_no>-<variable> or simply <site_no> 2) With suffix formatted with `obsnme_suffix_format` 3) With format of <original obgnme>_<obgnme_suffix> e.g. heads_annual-mean Notes ----- """ base_data['datetime'] = pd.to_datetime(base_data['datetime']) # only compute statistics on transient obs if isinstance(exclude_suffix, str): exclude_suffix = [exclude_suffix] suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme] keep = ~np.isin(suffix, exclude_suffix) base_data = base_data.loc[keep].copy() log_base_data = base_data.copy() data_cols = [c for c, dtype in base_data.dtypes.items() if 'float' in dtype.name] for c in data_cols: log_base_data[c] = np.log10(base_data[c]) log_base_data.loc[base_data[c] == 0, c] = fill_zeros_with # handle zero values log_base_data['obsprefix'] = [f"{prefix}-{obsnme_prefix_variable}" for prefix in log_base_data['obsprefix']] log_base_data['obsnme'] = [f"{prefix}_{dt:{obsnme_suffix_format}}".lower() for prefix, dt in zip(log_base_data['obsprefix'], log_base_data['datetime'])] log_base_data['obgnme'] = [f"{prefix}_{obgnme_suffix}" for prefix in log_base_data['obgnme']] cols = ['datetime', 'site_no', 'obsprefix', 'obsnme'] + data_cols + ['obgnme'] log_base_data = log_base_data[cols] if outfile is not None: log_base_data.fillna(-1e30).to_csv(outfile, sep=' ', index=False) print(f'wrote {len(log_base_data):,} observations to {outfile}') # write the instruction file if write_ins: write_insfile(log_base_data, str(outfile) + '.ins', obsnme_column='obsnme', simulated_obsval_column='sim_obsval', index=False) return log_base_data
[docs] def get_baseflow_observations(base_data, obsnme_prefix_variable='baseflow', obsnme_suffix_format='%Y%m%d', exclude_suffix='ss', obgnme_suffix='baseflow', missing_obs_fill_value=0., outfile=None, write_ins=False, **kwargs): r"""Create observations of base flow using the BFI/Institute of Hydrology hydrograph separation method (:func:`mfobs.sep.ih_method`). Parameters ---------- base_data : DataFrame Table of preprocessed observations, such as that produced by :func:`mfobs.obs.get_base_obs`. Must have the following columns, in addition to columns of floating point data to aggregate (which can have any name): =================== ============================================================= datetime pandas datetimes, based on stress period end date site_no unique site identifier obsprefix\ :sup:`1` prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix>\ :sup:`2` obgnme observation group name =================== ============================================================= 1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no> 2) Assumed to be formatted <obsprefix>_<`obsnme_suffix_format`> obsnme_prefix_variable : str Variable name indicating the type of observation. Is appended to the site number (site_no) to create new observation name prefixes (obsprefixes) in the format of <site_no>-<variable>. For example, by default, an observation prefix in base_data of '04026300-flow' or '04026300' (both for a site_no of '04026300'), would result in a new observation prefix of '04026300-baseflow'. A base flow computed from a total flow observation named '04026300-flow_20221103' or '04026300_20221103' (at site '04026300' on 2022/11/03) would be named '04026300-baseflow_20221103'. by default, 'baseflow'. obsnme_suffix_format : str, optional Format for suffix of obsnmes. By default '%Y%m%d' (e.g. 20010101 for Jan 1, 2001) exclude_suffix : str or list-like Option to exclude observations from processing by suffix; e.g. 'ss' to include steady-state observations. By default, 'ss' obgnme_suffix : str Create new observation group names by appending this suffix to existing obgnmes, for example <existing obgnme>_<obgnme_suffix> by default, 'baseflow' missing_obs_fill_value : float Baseflow observations are different from most other observation types in that they are produced by hydrograph separation, which can lead to different numbers of observations with changing conditions. For example, a given parameter set may produce a total flow hydrograph with a different number of ordinates (turning points), which in turn can change the interpolation of baseflow values between ordinates, potentially producing either 'extra' observations (not in the instruction file) or missing observations. Missing observations will be filled with `missing_obs_value`, so that PEST doesn't crash. A value of 0. is recommended, as missing observations are often associated with low or no-flow periods, especially near the start or end of a timeseries. outfile : str, optional CSV file to write output to. By default, None (no output written) write_ins : bool, optional Option to write instruction file, by default False **kwargs : key-word arguments to :func:`mfobs.sep.ih_method` Returns ------- bf_base_data : DataFrame With the following columns (in addition to the data columns in `base_data`, which now contain the aggregated values): =================== ============================================================= datetime observation dates as datetimes site_no unique site identifier obsprefix\ :sup:`1` prefix of observation name obsnme observation name based on format of <obsprefix>_<suffix>\ :sup:`2` obgnme observation group name\ :sup:`3` =================== ============================================================= 1) With format <site_no>-<variable> or simply <site_no> 2) With suffix formatted with `obsnme_suffix_format` 3) With format of <original obgnme>_<obgnme_suffix> e.g. heads_annual-mean Notes ----- """ base_data['datetime'] = pd.to_datetime(base_data['datetime']) # only compute statistics on transient obs if isinstance(exclude_suffix, str): exclude_suffix = [exclude_suffix] suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme] keep = ~np.isin(suffix, exclude_suffix) base_data = base_data.loc[keep].copy() bf_base_data = base_data.copy() data_cols = [c for c, dtype in base_data.dtypes.items() if 'float' in dtype.name] bf_base_data.index = pd.to_datetime(bf_base_data['datetime']) dfs = [] site_groups = bf_base_data.groupby('obsprefix') for obsprefix, group in site_groups: for c in data_cols: # series must be at least 2x the BFI block length (default=5) block_length = kwargs.get('block_length', 5) if len(group[c]) >= (block_length * 2): results = ih_method(group[c], **kwargs) group[c] = results['QB'] dfs.append(group) bf_base_data = pd.concat(dfs) # drop nan values bf_base_data.dropna(subset=['obsval', 'sim_obsval'], axis=0, inplace=True) bf_base_data['obsprefix'] = [f"{site_no}-{obsnme_prefix_variable}" for site_no in bf_base_data['site_no']] bf_base_data['obsnme'] = [f"{prefix}_{dt:{obsnme_suffix_format}}".lower() for prefix, dt in zip(bf_base_data['obsprefix'], bf_base_data['datetime'])] bf_base_data['obgnme'] = [f"{prefix}_{obgnme_suffix}" for prefix in bf_base_data['obgnme']] cols = ['datetime', 'site_no', 'obsprefix', 'obsnme'] + data_cols + ['obgnme'] bf_base_data = bf_base_data[cols] if outfile is not None: # write the instruction file if write_ins: write_insfile(bf_base_data, str(outfile) + '.ins', obsnme_column='obsnme', simulated_obsval_column='sim_obsval', index=False) # otherwise, normalize observations to the instruction file else: insfile_obs = get_insfile_observations(str(outfile) + '.ins') missing_in_output = set(insfile_obs).difference(bf_base_data['obsnme']) extra_in_output = set(bf_base_data['obsnme']).difference(insfile_obs) # fill missing observations if any(missing_in_output): bf_base_data.index = bf_base_data['obsnme'] # re-index the observation data to add rows for the missing obs reindexed = bf_base_data.reindex(insfile_obs) # identify rows containing missing obs filled = reindexed['sim_obsval'].isna() # refill columns for the missing obs reindexed['obsnme'] = reindexed.index reindexed.loc[filled, 'datetime'] = [pd.Timestamp(s.split('_')[1].split('-')[0]) for s in reindexed.loc[filled, 'obsnme']] reindexed.loc[filled, 'site_no'] = [s.split('_')[0].split('-')[0] for s in reindexed.loc[filled, 'obsnme']] reindexed.loc[filled, 'obsprefix'] = [s.split('_')[0] for s in reindexed.loc[filled, 'obsnme']] reindexed.loc[filled, 'obgnme'] = 'filled-bf' reindexed.loc[filled, data_cols] = missing_obs_fill_value assert not reindexed.isna().any().any() bf_base_data = reindexed # drop any extra observations if any(extra_in_output): bf_base_data = bf_base_data.loc[~bf_base_data.index.isin(extra_in_output)] bf_base_data.to_csv(outfile, sep=' ', index=False) print(f'wrote {len(bf_base_data):,} observations to {outfile}') return bf_base_data