"""
General functions for creating observation input for PEST,
including base observations representing field measurements and simulated equivalents,
and derivative observations that are created from the base observations.
"""
from pathlib import Path
import warnings
import numpy as np
import pandas as pd
from mfobs.checks import check_obsnme_suffix
from mfobs.fileio import write_insfile, get_insfile_observations
from mfobs.sep import ih_method
from mfobs.utils import fill_nats, set_period_start_end_dates
# [docs]
def get_base_obs(perioddata,
                 model_output,
                 observed_values_file,
                 variable_name=None,
                 simulated_values_column='sim_obsval',
                 observed_values_metadata_file=None,
                 observed_values_site_id_col='obsprefix',
                 observed_values_datetime_col='datetime',
                 obsnme_date_suffix=True,
                 obsnme_suffix_format='%Y%m',
                 observed_values_obsval_col='obsval',
                 observed_values_group_column='obgnme',
                 observed_values_unc_column='uncertainty',
                 aggregrate_observed_values_method='mean',
                 drop_groups=None,
                 label_period_as_steady_state=None, steady_state_period_start=None,
                 steady_state_period_end=None, forecast_sites=None,
                 forecast_start_date=None, forecast_end_date=None,
                 forecasts_only=False, forecast_sites_only=False,
                 outfile=None, verbose=True,
                 write_ins=False):
    """Get a set of base observations from tables of model output, observed
    values, and model time discretization.

    Parameters
    ----------
    perioddata : str, pathlike or DataFrame
        Path to csv file or pandas DataFrame with start/end dates for stress periods or timesteps.
        Must have a `start_datetime` and either a `time` column
        (of elapsed times at the end of each period, in days),
        or an `end_datetime` column of period end dates.

        =================== =======================================================
        start_datetime      start date for each stress period or timestep
        end_datetime        (optional) end date for each stress period or timestep
        time                (optional) elapsed simulation time at the end of each
                            period, in days
        =================== =======================================================

    model_output : DataFrame
        DataFrame with one head observation per row, with the following columns:

        ======================= =============================================================
        site_no                 unique identifier for each site
        variable                variable name for the simulated values
        obsprefix               prefix of observation name (<site_no>-<variable>)
        simulated_values_column column with simulated values
        datetime                pandas datetimes, based on stress period end date
        layer                   zero-based model layer
        obsnme                  observation name based on format of <obsprefix>_'%Y%m'
        ======================= =============================================================

        Usually, a helper function such as :func:`mfobs.modflow.get_mf6_single_variable_obs`
        or other custom code will be used to produce model_output from the
        raw model output files.
        Observation prefixes in the `obsprefix` column are assumed to be formatted as
        **<site_no>-<variable>**, where `site_no` is a unique identifier for a site
        contained in `observed_values_file` and `variable` matches the `variable_name` argument
        to this function (and is the variable name for the observed values in `observed_values_file`).
        All observation prefixes are assumed to be case-insensitive (obsprefixes in the model_output
        and observed_values_files tables are all converted to lower case prior to matching).
    observed_values_file : str, pathlike or DataFrame
        Path to csv file or pandas DataFrame with observed values.
        Must have the following columns (column names in brackets <> are supplied
        with other arguments to this function):

        ============================== ================================================================================
        <observed_values_obsval_col>   Column with observed values
        <observed_values_site_id_col>  Column with unique identifier for each site
        <observed_values_datetime_col> Column with date/time for each observation
        <observed_values_unc_column>   Column with estimated uncertainty values for each observation
        ============================== ================================================================================

    variable_name : str, optional
        Name for the type of values being processed,
        e.g. 'head', 'downstream-flow' or 'stage'. Required if the `obsprefix`
        values in the `model_output` table are formatted as **<site number>-<variable>**;
        if supplied, the observation names created for the observations in
        `observed_values_file` will also have **<site number>-<variable>** prefixes,
        and the observed values and simulated equivalents can be aligned.
        By default, None, in which case the `obsprefix` values in the `model_output`
        table are assumed to be the site numbers; prefixes for observations
        in `observed_values_file` will also then be site numbers.
    simulated_values_column : str, optional
        Column in model output with simulated values
    observed_values_metadata_file : str, pathlike or DataFrame, optional
        Table with site information for the observed values.
    observed_values_obsval_col : str, optional
        Column in obs_values_file with measured values
    observed_values_site_id_col : str
        Column with unique identifier for each site,
        by default, 'obsprefix'
    observed_values_datetime_col : str
        Column with date/time for each observation,
        by default, 'datetime'
    observed_values_unc_column : str
        Column with estimated uncertainty values for each observation,
        by default, 'uncertainty'
    aggregrate_observed_values_method : str
        Method for aggregating observed values to time periods bracketed by
        aggregate_start_dates and aggregate_end_dates.
        Can be any method used to aggregate values on a pandas groupby object
        (e.g. 'mean', 'last', etc.)
    obsnme_date_suffix : bool
        If true, give observations a date-based suffix. Otherwise, assign a
        stress period- or timestep-based suffix. In either case, the format of the suffix
        is controlled by obsnme_suffix_format.
        by default True
    obsnme_suffix_format : str, optional
        Format for suffix of obsnmes. Observation names are created following the format of
        <obsprefix>_<date or stress period suffix>. By default, ``'%Y%m'``,
        which would yield ``'202001'`` for a Jan, 2020 observation
        (obsnme_date_suffix=True). If obsnme_date_suffix=False, obsnme_suffix_format
        should be a decimal format in the "new-style" string format
        (e.g. '{:03d}', which would yield ``'001'`` for stress period 1.)
    label_period_as_steady_state : int, optional
        Zero-based model stress period where observations will be
        assigned the suffix 'ss' instead of a date suffix.
        By default, None, in which case all model output is assigned
        a date suffix based on the start date of the stress period.
        Passed to :func:`~mfobs.modflow.get_mf6_single_variable_obs`.
    steady_state_period_start : str, optional
        Start date for the period representing steady-state conditions.
        Observations between ``steady_state_period_start`` and ``steady_state_period_end``
        will be averaged to create additional observations with the suffix 'ss'.
        The steady-state averages will be matched to model output from the
        stress period specified by ``label_period_as_steady_state``.
        By default None, in which case all observations are used
        (if ``observed_values_datetime_col is None``)
        or no steady-state observations
        are created (``observed_values_datetime_col`` exists).
    steady_state_period_end : str, optional
        End date for the period representing steady-state conditions.
        By default None, in which case all observations are used
        (if ``observed_values_datetime_col is None``)
        or no steady-state observations
        are created (``observed_values_datetime_col`` exists).
    forecast_sites : str or sequence, optional
        At these sites, observations will be created for each simulated value,
        regardless of whether there is an observed equivalent. Can be supplied
        as a sequence of site numbers (`site_id`s) or ``'all'`` to
        include all sites. By default, None (no forecast sites).
    forecast_start_date : str, optional
        Start date for forecast period. When forecast_sites is not
        ``None``, forecast observations will be generated for each
        time between `forecast_start_date` and `forecast_end_date`.
        By default, None (generate forecasts for any time with missing values).
    forecast_end_date : str, optional
        End date for forecast period. When forecast_sites is not
        ``None``, forecast observations will be generated for each
        time between `forecast_start_date` and `forecast_end_date`.
        By default, None (generate forecasts for any time with missing values).
    forecasts_only : bool, optional
        Use this option to only output forecast observations
        (those without an observed equivalent), subject to the parameters of
        `forecast_sites`, `forecast_start_date`, and `forecast_end_date`.
    forecast_sites_only : bool, optional
        Option to only output observations at sites specified
        with `forecast_sites` (has no effect if ``forecast_sites='all'``). If
        ``forecasts_only=False``, the output will include forecast and non-forecast
        observations (for a continuous time-series).
    verbose : bool
        Controls the verbosity of screen outputs. If True, warnings will be printed
        when there are no observations within a stress period and forecast_sites is None,
        (meaning no observation results will be processed for that stress period), or when
        supplied observations fall outside of the model timeframe. Within a workflow,
        verbose can be set equal to write_ins for initial debugging, and then subsequently
        set False during history matching runs.
        by default, True
    outfile : str, optional
        CSV output file for the processed observations; by default, None (no output written)
    write_ins : bool, optional
        Option to write a PEST instruction file (`<outfile>.ins`); only has an
        effect when `outfile` is specified. By default False

    Returns
    -------
    base_obs : DataFrame
        Table of base observations with the following columns:

        =================== =============================================================
        datetime            pandas datetimes, based on stress period end date
        per                 MODFLOW stress period
        site_no             unique site identifier
        obsprefix           prefix of observation name
        obsnme              observation name based on format of <obsprefix>_<suffix>
        obsval              observed values
        sim_obsval          simulated equivalents to observed values
        obgnme              observation group name
        =================== =============================================================

    """
    # validation checks
    check_obsnme_suffix(obsnme_date_suffix, obsnme_suffix_format,
                        function_name='get_base_obs')

    outpath = Path('.')
    if outfile is not None:
        outpath = Path(outfile).parent

    # standard output column names (i.e. for the PEST control file)
    output_sim_values_column = 'sim_obsval'
    output_obs_values_column = 'obsval'

    # read in/set up the perioddata table
    if not isinstance(perioddata, pd.DataFrame):
        perioddata = pd.read_csv(perioddata)
    else:
        perioddata = perioddata.copy()
    set_period_start_end_dates(perioddata)

    # model results
    results = model_output.copy()
    results['obsprefix'] = results['obsprefix'].str.lower()

    # rename columns to their defaults
    renames = {observed_values_site_id_col: 'site_no',
               observed_values_datetime_col: 'datetime',
               observed_values_group_column: 'obgnme',
               observed_values_unc_column: 'uncertainty'
               }

    # read in/set up the observed values table
    if not isinstance(observed_values_file, pd.DataFrame):
        observed = pd.read_csv(observed_values_file,
                               dtype={observed_values_site_id_col: object})
    else:
        # copy, so the caller's DataFrame isn't mutated by the
        # in-place renames/column assignments below
        observed = observed_values_file.copy()
    observed.rename(columns=renames, inplace=True)
    if 'obsprefix' not in observed.columns:
        if variable_name is not None:
            observed['obsprefix'] = [f"{sn}-{variable_name}"
                                     for sn in observed['site_no']]
            if forecast_sites is not None and forecast_sites != 'all':
                forecast_sites = [f"{sn}-{variable_name}"
                                  for sn in forecast_sites]
        else:
            observed['obsprefix'] = observed['site_no'].astype(str)
    else:
        observed['obsprefix'] = observed['obsprefix'].astype(str)

    # read in the observed values metadata
    if observed_values_metadata_file is not None:
        if not isinstance(observed_values_metadata_file, pd.DataFrame):
            metadata = pd.read_csv(observed_values_metadata_file,
                                   dtype={observed_values_site_id_col: object})
        else:
            # copy, so the caller's DataFrame isn't mutated
            metadata = observed_values_metadata_file.copy()
        metadata.rename(columns=renames, inplace=True)
        if 'obsprefix' not in metadata.columns:
            # the site id column has already been renamed to 'site_no' above;
            # referencing observed_values_site_id_col here would raise a KeyError
            metadata['obsprefix'] = metadata['site_no']

        # join the metadata to the observed data
        metadata.index = metadata['obsprefix'].values
        observed.index = observed['obsprefix'].values
        join_cols = [c for c in ['site_no', 'screen_top', 'screen_botm', 'x', 'y', 'layer']
                     if c in metadata.columns]
        observed = observed.join(metadata[join_cols])

    # convert obs names and prefixes to lower case
    observed['obsprefix'] = observed['obsprefix'].str.lower()

    # make a dictionary of site metadata for possible use later
    # (to fill site info for forecast obs that have no observed equivalent)
    temp = observed.copy()
    temp.index = temp['obsprefix'].str.lower()
    site_info_dict = temp.to_dict()
    for col in 'datetime', 'obsval', observed_values_obsval_col:
        if col in site_info_dict:
            del site_info_dict[col]
    del temp

    # cast datetimes to pandas datetimes
    # (observed data may not have a datetime col. if model is steady-state)
    if observed_values_datetime_col is not None and 'datetime' in observed.columns:
        try:
            observed['datetime'] = pd.to_datetime(observed['datetime'])
        except ValueError:
            observed['datetime'] = pd.to_datetime(observed['datetime'], format='mixed')
        # not necessarily True
        observed['steady'] = False  # flag for steady-state observations
    elif label_period_as_steady_state is not None:
        observed['datetime'] = pd.to_datetime(
            perioddata['start_datetime'][label_period_as_steady_state])
        observed['steady'] = True
    else:
        if len(perioddata) == 1:
            observed['datetime'] = pd.to_datetime(perioddata['start_datetime'][0])
            observed['steady'] = True
        else:
            raise ValueError("Model has more than one stress period "
                             "but observed data have no 'datetime' column.")

    # drop model results that aren't in the obs information file
    # these are probably observations that aren't in the model time period
    # (and therefore weren't included in the parent model calibration;
    # but modflow-setup would include them in the MODFLOW observation input)
    # also drop sites that are in the obs information file, but not in the model results
    # these include sites outside of the model (i.e. in the inset when looking at the parent)
    no_info_sites = set(results.obsprefix).symmetric_difference(observed.obsprefix)
    if forecast_sites == 'all':
        # forecast observations at all simulated sites
        # (only drop sites that aren't simulated)
        no_info_sites = set(observed.obsprefix).difference(results.obsprefix)
    elif forecast_sites is not None:
        # remove selected forecast sites from 'no_info' sites to drop
        forecast_sites = {s.lower() for s in forecast_sites}
        no_info_sites = no_info_sites.difference(forecast_sites)
    # dump these out to a csv
    print('Dropping {} sites with no information'.format(len(no_info_sites)))
    dropped_obs_outfile = outpath / 'dropped_head_observation_sites.csv'
    results.loc[results.obsprefix.isin(no_info_sites)].to_csv(dropped_obs_outfile,
                                                              index=False)
    results = results.loc[~results.obsprefix.isin(no_info_sites)].copy()
    observed = observed.loc[~observed.obsprefix.isin(no_info_sites)].copy()
    if len(results) == 0:
        raise ValueError("No matches between observation prefixes in the model results "
                         "and observed values! Check that the obsprefixes in the "
                         "observed_values_file match those in the model_output DataFrame.")

    # for each model stress period or timestep, get the simulated values
    # and the observed equivalents
    observed.index = pd.to_datetime(observed.datetime)
    observed.sort_index(inplace=True)
    results.index = pd.to_datetime(results.datetime)
    results.sort_index(inplace=True)

    # integer column for stress period- or timestep-based obsnme suffixes
    # timestep-based observations
    if 'timestep' in perioddata.columns:
        perioddata['unique_timestep'] = list(range(len(perioddata)))
        per_column = 'unique_timestep'
    # stress period-based observations
    else:
        per_column = 'per'

    observed_simulated_combined = []
    # if no datetime column is supplied with observations,
    # only make steady state obs
    if observed_values_datetime_col is None and \
            label_period_as_steady_state is not None:
        idx = label_period_as_steady_state
        perioddata = perioddata[idx:idx + 1]
    for i, r in perioddata.iterrows():

        # get the equivalent observed values
        start, end = r['start_datetime'], r['end_datetime']

        # date-based suffix
        if obsnme_date_suffix:
            suffix = pd.Timestamp(end).strftime(obsnme_suffix_format)
        # stress period- or timestep-based suffix
        else:
            suffix = f"{r[per_column]:{obsnme_suffix_format.strip('{:}')}}"

        # steady-state observations can represent a period
        # other than the "modflow time" in the perioddata table
        if r['per'] == label_period_as_steady_state:
            suffix = 'ss'
            if steady_state_period_start is not None:
                start = steady_state_period_start
            if steady_state_period_end is not None:
                end = steady_state_period_end
        # don't process observations for a steady-state period unless
        # it is explicitly labeled as such and given a representative date range
        elif r['steady']:
            continue

        observed_in_period_rs = aggregrate_to_period(
            observed, start, end,
            aggregrate_observed_values_method=aggregrate_observed_values_method,
            obsnme_suffix=suffix)
        # skip periods with no observations (unless forecasts are being made);
        # verbose only controls whether a warning is issued, not the skip itself
        if (len(observed_in_period_rs) == 0) and (forecast_sites is None):
            if verbose and per_column == 'per':
                warnings.warn(('Stress period {}: No observations between start and '
                               'end dates of {} and {}!'.format(r['per'], start, end)))
            continue

        # get the simulated equivalents
        # first populate obsnmes
        sim_in_period_rs = aggregrate_to_period(
            results, start, end,
            aggregrate_observed_values_method=aggregrate_observed_values_method,
            obsnme_suffix=suffix)
        # skip periods with no simulated equivalents
        # (parenthesize the emptiness test separately from verbose, so that
        # the skip happens regardless of the verbosity setting)
        if (sim_in_period_rs is None) or (len(sim_in_period_rs) == 0):
            if verbose and per_column == 'per':
                warnings.warn(('Stress period {}: No simulated equivalents between start and '
                               'end dates of {} and {}!'.format(r['per'], start, end)))
            continue
        if forecast_sites is not None:
            observed_in_period_rs = observed_in_period_rs.reindex(sim_in_period_rs['obsnme'].values, axis=0)
            # NOTE(review): .levels[0] returns the *unique, sorted* prefixes;
            # this relies on sim_in_period_rs being sorted by obsprefix
            # (groupby output) with one observation per prefix — confirm
            obsprefix = observed_in_period_rs.index.str.split('_', expand=True).levels[0]
            observed_in_period_rs['obsprefix'] = obsprefix
            observed_in_period_rs['obsnme'] = observed_in_period_rs.index
            observed_in_period_rs['datetime'] = sim_in_period_rs['datetime'].values[0]

        # skip periods where no simulated values match the observation names
        any_simulated_obs = sim_in_period_rs.obsnme.isin(observed_in_period_rs.obsnme).any()
        if not any_simulated_obs:
            if verbose and per_column == 'per':
                warnings.warn(('Stress period {}: No observation/simulated equivalent pairs for start and '
                               'end dates of {} and {}!'.format(r['per'], start, end)))
            continue
        observed_in_period_rs[output_sim_values_column] = \
            sim_in_period_rs.reindex(observed_in_period_rs.index)[simulated_values_column]

        # add stress period and observed values
        observed_in_period_rs['per'] = r['per']
        if per_column == 'unique_timestep':
            observed_in_period_rs['timestep'] = r['timestep']
            observed_in_period_rs['unique_timestep'] = r['unique_timestep']
        observed_in_period_rs[output_obs_values_column] = observed_in_period_rs[observed_values_obsval_col]
        observed_simulated_combined.append(observed_in_period_rs)

    # Combined DataFrame of observed heads and simulated equivalents
    if len(observed_simulated_combined) == 0:
        msg = ("No overlap between model results "
               f"({results['datetime'].min():%Y-%m-%d} to {results['datetime'].max():%Y-%m-%d})"
               " and observed values "
               f"({observed['datetime'].min():%Y-%m-%d} to {observed['datetime'].max():%Y-%m-%d})!"
               )
        raise ValueError(msg)
    obsdata = pd.concat(observed_simulated_combined)

    # raise an error if there are duplicates- reindexing below will fail if this is the case
    if obsdata.index.duplicated().any():
        # report the rows with duplicated *names* (index),
        # consistent with the index-based check above
        msg = ('The following observations have duplicate names. There should only be '
               'one observation per site, for each time period implied by the '
               'obsnme_date_suffix_format parameter.\n{}'
               .format(obsdata.loc[obsdata.index.duplicated(keep=False)]))
        raise ValueError(msg)

    # drop any observations in specified groups
    # (e.g. lake stages that should be compared with lake package output)
    if drop_groups is not None and 'obgnme' in obsdata.columns:
        obsdata = obsdata.loc[~obsdata.obgnme.isin(drop_groups)].copy()

    if 'obgnme' not in obsdata.columns:
        obsdata['obgnme'] = variable_name

    # fill forecast obs with site info from observed dataframe
    if forecast_sites is not None:
        for k, v in site_info_dict.items():
            obsdata[k] = [v.get(p, current)
                          for p, current in zip(obsdata['obsprefix'], obsdata[k])]
        obsdata['obsnme'] = obsdata.index
    else:
        # nans are where sites don't have observation values for that period
        # or sites that are in other model (inset or parent)
        obsdata.dropna(subset=[output_obs_values_column], axis=0, inplace=True)

    # label forecasts in own group
    if forecast_sites is not None:
        is_forecast = obsdata[output_obs_values_column].isna()
        obsdata.loc[is_forecast, 'obgnme'] += '-forecast'

        # cull forecasts to specified date window
        # and specific sites (if specified)
        keep_forecasts = is_forecast.copy()
        if forecast_start_date is not None:
            keep_forecasts = (obsdata['datetime'] >= forecast_start_date)
        if forecast_end_date is not None:
            keep_forecasts &= (obsdata['datetime'] <= forecast_end_date)
        if forecast_sites != 'all':
            keep_forecasts &= obsdata['obsprefix'].isin(forecast_sites)
        # option to only include forecast obs
        # (those without observed equivalents)
        if forecasts_only:
            keep = keep_forecasts
        else:
            keep = keep_forecasts | ~is_forecast
        # option to only include output from designated forecast sites
        if forecast_sites_only:
            keep = keep & obsdata['obsprefix'].isin(forecast_sites)
        obsdata = obsdata.loc[keep].copy()

    # add standard obsval and obgnme columns
    columns = ['datetime', 'per', 'site_no', 'obsprefix', 'obsnme',
               output_obs_values_column, output_sim_values_column,
               'uncertainty', 'obgnme']
    if output_obs_values_column != 'obsval':
        obsdata['obsval'] = obsdata[output_obs_values_column]
        columns.insert(-1, 'obsval')

    # reorder the columns
    columns = [c for c in columns if c in obsdata.columns]
    base_obs = obsdata[columns].copy()
    if 'layer' in columns:
        base_obs['layer'] = base_obs['layer'].astype(int)

    # fill NaT (not a time) datetimes
    fill_nats(base_obs, perioddata)

    base_obs.sort_values(by=['obsprefix', 'per'], inplace=True)
    if outfile is not None:
        base_obs.fillna(-1e30).to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(base_obs):,} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(base_obs, str(outfile) + '.ins', obsnme_column='obsnme',
                          simulated_obsval_column=output_sim_values_column, index=False)
    return base_obs
# [docs]
def aggregrate_to_period(data, start, end, obsnme_suffix,
                         aggregrate_observed_values_method='mean',
                         ):
    """Aggregate a datetime-indexed table of observations to a single value
    per site (``obsprefix``) for the time window ``start:end``, and name the
    resulting observations ``<obsprefix>_<obsnme_suffix>``.

    Parameters
    ----------
    data : DataFrame
        Time-series data, indexed by datetime (sorted), with at least
        'obsprefix' and 'datetime' columns.
    start, end : str or datetime-like
        Inclusive bounds of the aggregation window.
    obsnme_suffix : str
        Suffix appended to each lower-cased obsprefix to form the obsnme.
    aggregrate_observed_values_method : str
        Name of any pandas GroupBy reduction (e.g. 'mean', 'last') applied
        to the float-valued columns; all other columns take their first
        value within the window. By default, 'mean'.

    Returns
    -------
    DataFrame
        One row per site, indexed by 'obsnme', with a 'datetime' column set
        to ``end`` and an 'n' column counting the values aggregated.
        Empty (but with the same columns) if no data fall in the window.
    """
    window = data.loc[start:end].reset_index(drop=True)
    # no data in the window: return an empty frame with the expected columns
    if len(window) == 0:
        empty = pd.DataFrame(columns=list(data.columns) + ['obsnme'])
        empty.index = empty['obsnme']
        return empty
    window.sort_values(by=['obsprefix', 'datetime'], inplace=True)
    # 'n' tracks how many values went into each aggregated observation
    if 'n' not in window.columns:
        window['n'] = 1
    grouped = window.groupby('obsprefix')
    aggregated = grouped.first()
    # apply the requested reduction to the float columns only;
    # non-float columns keep their first value in the window
    float_columns = [name for name, dtype in window.dtypes.items()
                     if 'float' in dtype.name]
    for name in float_columns:
        aggregated[name] = getattr(grouped[name], aggregrate_observed_values_method)()
    aggregated['n'] = grouped['n'].sum()
    aggregated['datetime'] = pd.Timestamp(end)
    aggregated.reset_index(inplace=True)  # put obsprefix back
    # back-fill any columns dropped by the groupby
    for name in set(window.columns).difference(aggregated.columns):
        aggregated[name] = grouped[name].first().values
    aggregated = aggregated[window.columns]
    aggregated['obsnme'] = ['{}_{}'.format(prefix.lower(), obsnme_suffix)
                            for prefix in aggregated.obsprefix]
    aggregated.index = aggregated['obsnme']
    return aggregated
# [docs]
def get_spatial_differences(base_data, perioddata,
difference_sites,
obs_values_col='obs_head',
sim_values_col='sim_head',
use_gradients=False,
sep='-d-',
write_ins=False, outfile=None):
r"""Takes the base_data dataframe output by :func:`mfobs.obs.get_obs` and creates
spatial difference observations. Optionally writes an output csvfile
and a PEST instruction file.
Parameters
----------
base_data : DataFrame
Table of preprocessed observations, such as that produced by
:func:`mfobs.obs.get_base_obs`. Must have the following columns:
=================== ========================================================================
datetime pandas datetimes, based on stress period end date
per MODFLOW stress period
site_no unique site identifier
obsprefix\ :sup:`1` prefix of observation name
obsnme observation name based on format of <obsprefix>_<suffix>
obsval observed values
sim_obsval simulated equivalents to observed values
obgnme observation group name
screen_top (optional) well open interval top; required if ``use_gradients=True``
screen_botm (optional) well open interval bottom; required if ``use_gradients=True``
=================== ========================================================================
1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no>
perioddata : DataFrame
DataFrame with start/end dates for stress periods. Must have columns
'time' (modflow time, in days), 'start_datetime' (start date for the stress period)
and 'end_datetime' (end date for the stress period).
difference_sites : dict
Dictionary of site numbers (keys) and other site numbers to compare to (values).
Values can be a string for a single site, a list of strings for multiple sites,
or a string pattern contained in multiple site numbers;
observations at the sites represented in the values will be compared to the observation
at the site represented by the key, at times of coincident measurements. Differences
are computed by subtracting the values site(s) from the key site, so for example,
to represent a gain in streamflow as positive, the downstream site should be key site.
obs_values_col : str
Column in ``base_data`` with observed values
sim_values_col : str
Column in `base_data`` with simulated equivalent values
use_gradients : bool
If True, compute vertical hydraulic gradients and use those for the
observation values, if False, use differences. For this option,
'screen_top' and 'screen_botm' columns are needed in ``base_data``.
By default False.
sep : str
Separator in spatial difference obsnnames. For example, with
sites "site1" and "site2" at time "202001", and sep='-d-', the obsnme
would be "site1-d-site2_202001".
by default, '-d-'
outfile : str, optional
CSV file to write output to. Nan values are filled with -1e30.
By default, None (no output written)
write_ins : bool, optional
Option to write instruction file, by default False
Returns
=======
spatial_differences : DataFrame
Spatial difference observations. Columns:
================= ===================================================================================
datetime observation date-time labels
per model stress period
obsprefix observation name prefix (site identifier)
obsnme1 name of observation from keys of ``difference_sites``
<obs_values_col>1 observed value associated with obsnme1
<sim_values_col>1 simulated equivalent associated with obsnme1
screen_top1 well screen top (elevation) associated with obsnme1*
screen_botm1 well screen botm (elevation) associated with obsnme1*
layer1 model layer associated with obsnme1*
obsnme2 name of observation from value(s) in ``difference_sites`` (associated with obsnme1)
<obs_values_col>2 observed value associated with obsnme2
<sim_values_col>2 simulated equivalent associated with obsnme2
screen_top2 well screen top (elevation) associated with obsnme2*
screen_botm2 well screen botm (elevation) associated with obsnme2*
layer2 model layer associated with obsnme2*
obs_diff observed difference between obsnme1 and obsnme2
sim_diff simulated equivalent difference between obsnme1 and obsnme2
dz distance between well screen midpoints for obsnme1 and obsnme2*
obs_grad observed vertical hydraulic gradient between obsnme1 and obsnme2*
sim_grad simulated equivalent vertical hydraulic gradient between obsnme1 and obsnme2*
obgnme observation group
obsnme spatial difference observation name
obsval observation value (i.e. for PEST control file)
sim_obsval simulated equivalent (i.e. for PEST instruction file)
type description of spatial difference observations
uncertainty (loosely) error-based uncertainty, assumed to be 2x that of obsnme2
================= ===================================================================================
Notes:
* * denotes optional columns that may not be present.
* Columns relating to well open interval are only created if
``base_data`` has 'screen_top' and 'screen_botm' columns.
* Negative difference or gradient values indicate a gradient towards the key site.
"""
# copy the base data to prevent unintended side-effects
base_data = base_data.copy()
# model stress period data:
perioddata = perioddata.copy()
# make sure start and end dates don't overlap
set_period_start_end_dates(perioddata)
perioddata.index = perioddata.per
# rename the observed and sim. eq. values columns
#if variable is not None:
# renames = {obs_values_col: f'obs_{variable}',
# sim_values_col: f'sim_{variable}',
# }
# obs_values_col = f'obs_{variable}'
# sim_values_col = f'sim_{variable}'
#else:
renames = {obs_values_col: 'base_obsval', # f'obs_{variable}',
sim_values_col: 'base_sim_obsval', # f'sim_{variable}',
}
obs_values_col = f'base_obsval'
sim_values_col = f'base_sim_obsval'
base_data.drop([#f'obs_{variable}', f'sim_{variable}',
'base_obsval', 'bas_sim_obsval'], axis=1,
inplace=True, errors='ignore')
base_data.rename(columns=renames, inplace=True)
# rename the observed and sim. eq. values columns
#renames = {obs_values_col: f'obs_{variable}',
# sim_values_col: f'sim_{variable}',
# }
#base_data.drop([f'obs_{variable}', f'sim_{variable}'], axis=1,
# inplace=True, errors='ignore')
#base_data.rename(columns=renames, inplace=True)
#obs_values_col = f'obs_{variable}'
#sim_values_col = f'sim_{variable}'
# get a set of unique variables
# (including '', which signifies no variable in the obsprefix,
# just a site number)
variables = {s.split('-')[1] if len(s.split('-')) > 1 else ''
for s in base_data['obsprefix']}
# get subset of base_data sites to compare to each key site in difference_sites
base_data_obsprefixes = set(base_data['obsprefix'])
groups = base_data.groupby('obsprefix')
spatial_differences = []
# key site: value/comparison sites can be include variables or not
# if they don't, and the base_data includes variables,
# process each obsprefix with that site number
for variable in variables:
for key_obsprefix, patterns in difference_sites.items():
if len(variable) > 0:
key_obsprefix = f"{key_obsprefix.split('-')[0]}-{variable}"
if key_obsprefix not in base_data_obsprefixes:
print((f'warning: observation prefix {key_obsprefix} not in base_data. '
'Skipping spatial differencing.'))
continue
compare = []
if isinstance(patterns, str):
patterns = [patterns]
# matching obsprefixes must have the pattern and the variable
# if obsprefixes are simply formatted as site numbers
# (no variables)
# then the only variable will be an empty string,
# which will match any string
for pattern in patterns:
matches = [True if pattern in obsprefix and variable in obsprefix
else False for obsprefix in base_data.obsprefix]
compare.append(matches)
compare = np.any(compare, axis=0)
prefixes = set(base_data.loc[compare, 'obsprefix'])
# for each site in the subset, compare the values to the keys site
# index by stress period
key_values = groups.get_group(key_obsprefix).copy()
key_values.index = key_values.per
for obsprefix, site_observations in groups:
if obsprefix in prefixes:
site_obs = site_observations.copy()
site_obs.rename(columns={obs_values_col: f"{obs_values_col}2", # 'obs_head2',
sim_values_col: f"{sim_values_col}2", # 'sim_head2',
'obsnme': 'obsnme2',
'screen_top': 'screen_top2',
'screen_botm': 'screen_botm2',
'layer': 'layer2'
}, inplace=True)
site_obs.index = site_obs.per
site_obs['obsnme1'] = key_values['obsnme']
site_obs[f"{obs_values_col}1"] = key_values[obs_values_col]
site_obs[f"{sim_values_col}1"] = key_values[sim_values_col]
if 'screen_top' in key_values.columns:
site_obs['screen_top1'] = key_values['screen_top']
if use_gradients and 'screen_botm' not in key_values.columns:
raise ValueError("use_gradients=True requires both screen_top "
"and screen_botm columns in base_data")
if 'screen_botm' in key_values.columns:
site_obs['screen_botm1'] = key_values['screen_botm']
if 'layer2' in site_obs.columns:
site_obs['layer1'] = key_values['layer']
# negative values indicate gradient towards key site
# (key site head < values site head)
site_obs['obs_diff'] = site_obs[f"{obs_values_col}1"] - site_obs[f"{obs_values_col}2"]
site_obs['sim_diff'] = site_obs[f"{sim_values_col}1"] - site_obs[f"{sim_values_col}2"]
# get a screen midpoint and add gradient
screen_midpoint1 = None
if use_gradients and {'screen_top1', 'screen_botm1'}.intersection(site_obs.columns):
screen_midpoint1 = site_obs[['screen_top1', 'screen_botm1']].mean(axis=1)
if use_gradients and {'screen_top2', 'screen_botm2'}.intersection(site_obs.columns):
screen_midpoint2 = site_obs[['screen_top2', 'screen_botm2']].mean(axis=1)
if screen_midpoint1 is not None:
site_obs['dz'] = (screen_midpoint1 - screen_midpoint2)
site_obs['obs_grad'] = site_obs['obs_diff'] / site_obs['dz']
site_obs['sim_grad'] = site_obs['sim_diff'] / site_obs['dz']
# default obgnme prefix
if 'obgnme' not in site_obs.columns:
if len(variable) == 0:
site_obs['obgnme'] = 'sdiff'
else:
site_obs['obgnme'] = variable
# obgnme = <prefix>_sdiff
# unless there is no prefix
site_obs['obgnme'] = [f'{g}_sdiff' if g != 'sdiff' else g
for g in site_obs['obgnme']]
# whether to use gradients for the obsvals, or just head differences
if use_gradients:
site_obs['obsval'] = site_obs['obs_grad']
site_obs['sim_obsval'] = site_obs['sim_grad']
if len(variable) == 0:
site_obs['type'] = f'gradient'
else:
site_obs['type'] = f'{variable} gradient'
else:
site_obs['obsval'] = site_obs['obs_diff']
site_obs['sim_obsval'] = site_obs['sim_diff']
if len(variable) == 0:
site_obs['type'] = f'spatial difference'
else:
site_obs['type'] = f'spatial {variable} difference'
spatial_differences.append(site_obs)
if len(spatial_differences) == 0:
raise ValueError('No spatial difference site/variable pairs found! '
'Check that the key sites and their compare patterns exist in the base_data.')
spatial_differences = pd.concat(spatial_differences)
spatial_differences.dropna(subset=['obs_diff', 'sim_diff'], axis=0, inplace=True)
# name the spatial head difference obs as
# <obsprefix1><sep><obsprefix2>_<suffix>
obsnme = []
obsprefix = []
for i, r in spatial_differences.iterrows():
prefix1, suffix1 = r.obsnme1.split('_')
prefix2, suffix2 = r.obsnme2.split('_')
assert suffix1 == suffix2, "Observations are at different times! {}, {}".format(r.obsnme1,
r.obsnme2)
# if the obsprefixes include variables; only retain the the second instance
prefix = f"{prefix1.split('-')[0]}{sep}{prefix2}"
obsnme.append('{}_{}'.format(prefix, suffix2))
obsprefix.append(prefix)
spatial_differences['obsnme'] = obsnme
spatial_differences['obsprefix'] = obsprefix
# clean up columns
cols = ['datetime', 'per', 'obsprefix',
'obsnme1', f"{obs_values_col}1", f"{sim_values_col}1", 'screen_top1', 'screen_botm1', 'layer1',
'obsnme2', f"{obs_values_col}2", f"{sim_values_col}2", 'screen_top2', 'screen_botm2', 'layer2',
'obs_diff', 'sim_diff', 'dz', 'obs_grad', 'sim_grad',
'obsnme', 'obsval', 'sim_obsval', 'obgnme', 'type'
]
cols = [c for c in cols if c in spatial_differences.columns]
spatial_differences = spatial_differences[cols]
spatial_differences.dropna(axis=0, subset=['obsval'], inplace=True)
# uncertainty column is from base_data;
# assume that spatial head differences have double the uncertainty
# (two wells/two measurements per obs)
if 'uncertainty' in spatial_differences.columns:
spatial_differences['uncertainty'] *= 2
# check for duplicates
assert not spatial_differences['obsnme'].duplicated().any()
# fill NaT (not a time) datetimes
fill_nats(spatial_differences, perioddata)
if outfile is not None:
spatial_differences.fillna(-1e30).to_csv(outfile, sep=' ', index=False)
print(f'wrote {len(spatial_differences):,} observations to {outfile}')
# write the instruction file
if write_ins:
write_insfile(spatial_differences, str(outfile) + '.ins',
obsnme_column='obsnme',
simulated_obsval_column='sim_obsval', index=False)
return spatial_differences
[docs]
def get_temporal_differences(base_data, perioddata,
                             obs_values_col='obs_head',
                             sim_values_col='sim_head',
                             variable='head',
                             get_displacements=False,
                             displacement_from=None,
                             obsnme_date_suffix=True,
                             obsnme_suffix_format='%Y%m',
                             exclude_suffix='ss',
                             exclude_obs=None,
                             outfile=None,
                             write_ins=False):
    """Takes the base_data dataframe output by :func:`mfobs.obs.get_obs`,
    creates temporal difference observations. Optionally writes an output csvfile
    and a PEST instruction file.

    Parameters
    ----------
    base_data : DataFrame
        Head observation data with same column structure as
        output from :func:`mfobs.obs.get_obs`
    perioddata : DataFrame
        DataFrame with start/end dates for stress periods. Must have columns
        'time' (modflow time, in days), 'start_datetime' (start date for the stress period)
        and 'end_datetime' (end date for the stress period).
    obs_values_col : str
        Column in ``base_data`` with observed values
    sim_values_col : str
        Column in ``base_data`` with simulated equivalent values
    variable : str {'head', 'flux', or other}
        Type of observation being processed. Simulated and observed values
        columns are named in the format 'sim_<variable>' and 'obs_<variable>',
        respectively. If there is no 'obgnme' column in ``base_data``,
        ``variable`` is also used as a default base group name.
    get_displacements : bool
        If True, compute the displacement of each observation from
        a datum (specified by ``displacement_from``). If False, difference
        each observation with the previous observation.
        by default, False
    displacement_from : str or date-like
        Datum for computing displacements. Must be in a format that can be
        used for time slicing in pandas (e.g. '2010-01-01', which would result
        in displacements from the first observation on or after '2010-01-01' at each site),
        or None, which would result in displacements from the first observation
        at each site. By default, None
    obsnme_date_suffix : bool
        If true, give observations a date-based suffix. Otherwise, assign a
        stress period-based suffix. In either case, the format of the suffix
        is controlled by obsnme_suffix_format.
        by default True
    obsnme_suffix_format : str, optional
        Format for suffix of obsnmes. Observation names are created following the format of
        <obsprefix>_<date or stress period suffix>. By default, ``'%Y%m'``,
        which would yield ``'202001'`` for a Jan, 2020 observation
        (obsnme_date_suffix=True). If obsnme_date_suffix=False, obsnme_suffix_format
        should be a decimal format in the "new-style" string format
        (e.g. '{:03d}', which would yield ``'001'`` for stress period 1.)
    exclude_suffix : str or list-like
        Option to exclude observations from processing by suffix;
        e.g. 'ss' to exclude steady-state observations.
        By default, 'ss'
    exclude_obs : list-like
        Sequence of observation names to exclude from return/written dataset. For example,
        if sequential head differences are also being computed, the first displacement observation
        after the reference observation will be a duplicate of the first sequential head difference
        observation. By default, None (no observations excluded).
    outfile : str, optional
        CSV file to write output to.
        By default, None (no output written)
    write_ins : bool, optional
        Option to write instruction file, by default False

    Returns
    -------
    period_diffs : DataFrame

    Raises
    ------
    ValueError
        If ``base_data`` contains duplicate column names.

    Notes
    -----
    Differences are computed by subtracting the previous time from the current,
    so a positive value indicates an increase.
    """
    # copy the base data to prevent unintended side-effects
    base_data = base_data.copy()
    base_data['datetime'] = pd.to_datetime(base_data['datetime'])

    # validation checks
    check_obsnme_suffix(obsnme_date_suffix, obsnme_suffix_format,
                        function_name='get_temporal_differences', obsdata=base_data)
    if base_data.columns.duplicated().any():
        raise ValueError('Duplicate column names in base_data:\n'
                         f'{base_data.columns[base_data.columns.duplicated(keep=False)]}')

    # rename the observed and sim. eq. values columns
    if variable is not None:
        renames = {obs_values_col: f'obs_{variable}',
                   sim_values_col: f'sim_{variable}',
                   }
        obs_values_col = f'obs_{variable}'
        sim_values_col = f'sim_{variable}'
    else:
        renames = {obs_values_col: 'base_obsval',
                   sim_values_col: 'base_sim_obsval',
                   }
        obs_values_col = 'base_obsval'
        sim_values_col = 'base_sim_obsval'
    # drop any pre-existing columns with the destination names,
    # so the rename below can't produce duplicate column names
    base_data.drop([f'obs_{variable}', f'sim_{variable}',
                    'base_obsval', 'base_sim_obsval'], axis=1,
                   inplace=True, errors='ignore')
    base_data.rename(columns=renames, inplace=True)

    # only compute differences on transient obs
    if isinstance(exclude_suffix, str):
        exclude_suffix = [exclude_suffix]
    suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme]
    keep = ~np.isin(suffix, exclude_suffix)
    base_data = base_data.loc[keep].copy()

    # group observations by site (prefix)
    sites = base_data.groupby('obsprefix')
    period_diffs = []
    # get index position for obsval columns,
    # after adding them if needed
    # (this allows assignment of new values purely by iloc,
    # which avoids setting on slices in loop below)
    if 'obsval' in base_data.columns:
        obsval_col_iloc = base_data.columns.get_loc('obsval')
    else:
        base_data['obsval'] = -1e30
        obsval_col_iloc = len(base_data.columns) - 1
    if 'sim_obsval' in base_data.columns:
        sim_obsval_col_iloc = base_data.columns.get_loc('sim_obsval')
    else:
        base_data['sim_obsval'] = -1e30
        sim_obsval_col_iloc = len(base_data.columns) - 1

    for site_no, values in sites:
        values = values.sort_values(by=['per']).copy()
        values.index = values['datetime']

        # compute the differences
        if get_displacements:
            values = values.sort_index().loc[displacement_from:].copy()
            # some sites may not have any measurements
            # after displacement datum; skip these
            if len(values) <= 1:
                continue
            values['obsval'] = values[obs_values_col] - \
                values[obs_values_col].iloc[0]
            values['sim_obsval'] = values[sim_values_col] - \
                values[sim_values_col].iloc[0]
            # assign np.nan to starting displacements (of 0)
            # (so they get dropped later on,
            # consistent with workflow for sequential difference obs)
            values.iloc[0, obsval_col_iloc] = np.nan
            values.iloc[0, sim_obsval_col_iloc] = np.nan
        else:
            values['obsval'] = values[obs_values_col].diff()
            values['sim_obsval'] = values[sim_values_col].diff()

        # name the temporal difference obs as
        # <obsprefix>_<obsname1 suffix>d<obsname2 suffix>
        # where the obsval = obsname2 - obsname1
        obsnme = []
        for i, (idx, r) in enumerate(values.iterrows()):
            obsname2_suffix = ''
            if i > 0:
                if get_displacements:
                    obsname_2_loc = 0
                else:
                    obsname_2_loc = i - 1
                # date-based suffixes
                if obsnme_date_suffix:
                    obsname2_suffix = values.iloc[obsname_2_loc] \
                        ['datetime'].strftime(obsnme_suffix_format)
                # stress period-based suffixes
                else:
                    per = values.iloc[obsname_2_loc]['per']
                    obsname2_suffix = f"{per:{obsnme_suffix_format.strip('{:}')}}"
            obsnme.append('{}d{}'.format(r.obsnme, obsname2_suffix))
        values['obsnme'] = obsnme
        # todo: is there a general uncertainty approach for temporal differences that makes sense?
        period_diffs.append(values)
    period_diffs = pd.concat(period_diffs).reset_index(drop=True)

    # some checks to make sure the assignments from the above loop are correct
    assert period_diffs.iloc[:, sim_obsval_col_iloc].name == 'sim_obsval'
    assert period_diffs.iloc[:, obsval_col_iloc].name == 'obsval'
    assert not np.any(period_diffs[['obsval', 'sim_obsval']] == -1e30)

    period_diffs['datetime'] = pd.to_datetime(period_diffs['datetime'])
    if 'obgnme' not in period_diffs.columns:
        period_diffs['obgnme'] = variable
    if get_displacements:
        period_diffs['type'] = f'{variable} displacement'
        period_diffs['obgnme'] = [f'{g}_disp' for g in period_diffs['obgnme']]
    else:
        period_diffs['type'] = f'temporal {variable} difference'
        period_diffs['obgnme'] = [f'{g}_tdiff' for g in period_diffs['obgnme']]

    # drop some columns that aren't really valid; if they exist
    period_diffs.drop(['n'], axis=1, inplace=True, errors='ignore')

    # clean up columns
    cols = ['datetime', 'per', 'obsprefix', 'obsnme',
            f'obs_{variable}', f'sim_{variable}', 'screen_top', 'screen_botm', 'layer',
            'obsval', 'sim_obsval', 'obgnme', 'type']
    cols = [c for c in cols if c in period_diffs.columns]
    period_diffs = period_diffs[cols].copy()

    # drop observations with no difference (first observations at each site)
    period_diffs.dropna(axis=0, subset=['sim_obsval'], inplace=True)

    # drop any excluded obs
    if exclude_obs is not None:
        exclude_obs = set(exclude_obs)
        print(f"dropping {len(exclude_obs)} observations specified with exclude_obs")
        period_diffs = period_diffs.loc[~period_diffs['obsnme'].isin(exclude_obs)].copy()

    # fill NaT (not a time) datetimes
    fill_nats(period_diffs, perioddata)

    if outfile is not None:
        period_diffs.fillna(-1e30).to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(period_diffs):,} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(period_diffs, str(outfile) + '.ins',
                          obsnme_column='obsnme', simulated_obsval_column='sim_obsval',
                          index=False)
    return period_diffs
[docs]
def get_annual_means(base_data,
                     obsnme_suffix_format='%Y',
                     exclude_suffix='ss',
                     obgnme_suffix='annual-mean',
                     outfile=None,
                     write_ins=False):
    r"""Create observations of annual mean values for a set of
    base observations. Means are computed for all columns with
    floating point data.

    Parameters
    ----------
    base_data : DataFrame
        Table of preprocessed observations, such as that produced by
        :func:`mfobs.obs.get_base_obs`. Must have the following columns,
        in addition to columns of floating point data to aggregate
        (which can have any name):

        =================== =============================================================
        datetime            pandas datetimes, based on stress period end date
        site_no             unique site identifier
        obsprefix\ :sup:`1` prefix of observation name
        obsnme              observation name based on format of <obsprefix>_<suffix>\ :sup:`2`
        obgnme              observation group name
        =================== =============================================================

        1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no>
        2) Assumed to be formatted <obsprefix>_<`obsnme_suffix_format`>

    obsnme_suffix_format : str, optional
        Format for suffix of obsnmes.
        By default '%Y', which returns the 4-digit year
    exclude_suffix : str or list-like
        Option to exclude observations from processing by suffix;
        e.g. 'ss' to exclude steady-state observations.
        By default, 'ss'
    obgnme_suffix : str
        Create new observation group names by appending this suffix to existing
        obgnmes, for example <existing obgnme>_<obgnme_suffix>
        by default, 'annual-mean'
    outfile : str, optional
        CSV file to write output to.
        By default, None (no output written)
    write_ins : bool, optional
        Option to write instruction file, by default False

    Returns
    -------
    aggregated : DataFrame
        With the following columns (in addition to the data columns in `base_data`,
        which now contain the aggregated values):

        =================== =============================================================
        datetime            datetime of the first observation in each year group
        year                year of annual mean, as integer
        site_no             unique site identifier
        obsprefix\ :sup:`1` prefix of observation name
        obsnme              observation name based on format of <obsprefix>_<suffix>\ :sup:`2`
        obgnme              observation group name\ :sup:`3`
        n                   number of observations in each annual mean
        =================== =============================================================

        1) With format <site_no>-<variable> or simply <site_no>
        2) With suffix formatted with `obsnme_suffix_format`
        3) With format of <original obgnme>_<obgnme_suffix>
           e.g. heads_annual-mean
    """
    # work with a copy, so the datetime conversion below
    # doesn't propagate back to the caller's DataFrame
    base_data = base_data.copy()
    base_data['datetime'] = pd.to_datetime(base_data['datetime'])
    # only compute statistics on transient obs
    if isinstance(exclude_suffix, str):
        exclude_suffix = [exclude_suffix]
    suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme]
    keep = ~np.isin(suffix, exclude_suffix)
    base_data = base_data.loc[keep].copy()

    # only include times where there are both an observation and sim. equivalent
    base_data.dropna(subset=['obsval', 'sim_obsval'], inplace=True)

    grouped = base_data.groupby([base_data['site_no'], base_data['datetime'].dt.year])
    # start from the first row in each group (retains the non-aggregated columns),
    # then overwrite the floating point columns with the group means
    aggregated = grouped.first()
    data_cols = [c for c, dtype in base_data.dtypes.items() if 'float' in dtype.name]
    for c in data_cols:
        aggregated[c] = grouped[c].mean()
    # record the number of observations contributing to each mean
    aggregated['n'] = grouped.count().iloc[:, 0]
    aggregated.index.set_names(['site_no', 'year'], inplace=True)
    aggregated.reset_index(inplace=True)
    aggregated.sort_values(by=['site_no', 'year'], inplace=True)
    aggregated['obsnme'] = [f"{prefix}_{dt:{obsnme_suffix_format}}"
                            for prefix, dt in zip(aggregated['obsprefix'],
                                                  aggregated['datetime'])]
    aggregated['obgnme'] = [f"{prefix}_{obgnme_suffix}"
                            for prefix in aggregated['obgnme']]
    cols = ['datetime', 'year', 'site_no', 'obsprefix', 'obsnme'] + data_cols + ['obgnme', 'n']
    aggregated = aggregated[cols]
    if outfile is not None:
        aggregated.fillna(-1e30).to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(aggregated):,} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(aggregated, str(outfile) + '.ins',
                          obsnme_column='obsnme', simulated_obsval_column='sim_obsval',
                          index=False)
    return aggregated
[docs]
def get_monthly_means(base_data,
                      obsnme_suffix_format='%Y%m',
                      exclude_suffix='ss',
                      obgnme_suffix='monthly-mean',
                      outfile=None,
                      write_ins=False):
    r"""Create observations of monthly mean values for a set of
    base observations. Means are computed for all columns with
    floating point data.

    Parameters
    ----------
    base_data : DataFrame
        Table of preprocessed observations, such as that produced by
        :func:`mfobs.obs.get_base_obs`. Must have the following columns,
        in addition to columns of floating point data to aggregate
        (which can have any name):

        =================== =============================================================
        datetime            pandas datetimes, based on stress period end date
        site_no             unique site identifier
        obsprefix\ :sup:`1` prefix of observation name
        obsnme              observation name based on format of <obsprefix>_<suffix>\ :sup:`2`
        obgnme              observation group name
        =================== =============================================================

        1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no>
        2) Assumed to be formatted <obsprefix>_<`obsnme_suffix_format`>

    obsnme_suffix_format : str, optional
        Format for suffix of obsnmes.
        By default '%Y%m', which returns the year and month as consecutive integers,
        e.g. 200101 for Jan, 2001.
    exclude_suffix : str or list-like
        Option to exclude observations from processing by suffix;
        e.g. 'ss' to exclude steady-state observations.
        By default, 'ss'
    obgnme_suffix : str
        Create new observation group names by appending this suffix to existing
        obgnmes, for example <existing obgnme>_<obgnme_suffix>
        by default, 'monthly-mean'
    outfile : str, optional
        CSV file to write output to.
        By default, None (no output written)
    write_ins : bool, optional
        Option to write instruction file, by default False

    Returns
    -------
    aggregated : DataFrame
        With the following columns (in addition to the data columns in `base_data`,
        which now contain the aggregated values):

        =================== =============================================================
        datetime            datetime of the first observation in each month group
        site_no             unique site identifier
        obsprefix\ :sup:`1` prefix of observation name
        obsnme              observation name based on format of <obsprefix>_<suffix>\ :sup:`2`
        obgnme              observation group name\ :sup:`3`
        n                   number of observations in each monthly mean
        =================== =============================================================

        1) With format <site_no>-<variable> or simply <site_no>
        2) With suffix formatted with `obsnme_suffix_format`
        3) With format of <original obgnme>_<obgnme_suffix>
           e.g. heads_monthly-mean
    """
    # work with a copy, so the datetime conversion below
    # doesn't propagate back to the caller's DataFrame
    base_data = base_data.copy()
    base_data['datetime'] = pd.to_datetime(base_data['datetime'])
    # only compute statistics on transient obs
    if isinstance(exclude_suffix, str):
        exclude_suffix = [exclude_suffix]
    suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme]
    keep = ~np.isin(suffix, exclude_suffix)
    base_data = base_data.loc[keep].copy()

    # only include times where there are both an observation and sim. equivalent
    base_data.dropna(subset=['obsval', 'sim_obsval'], inplace=True)

    grouped = base_data.groupby([base_data['site_no'],
                                 base_data['datetime'].dt.year,
                                 base_data['datetime'].dt.month])
    # start from the first row in each group (retains the non-aggregated columns),
    # then overwrite the floating point columns with the group means
    aggregated = grouped.first()
    data_cols = [c for c, dtype in base_data.dtypes.items() if 'float' in dtype.name]
    for c in data_cols:
        aggregated[c] = grouped[c].mean()
    # record the number of observations contributing to each mean
    aggregated['n'] = grouped.count().iloc[:, 0]
    aggregated.index.set_names(['site_no', 'year', 'month'], inplace=True)
    aggregated.reset_index(inplace=True)
    aggregated.sort_values(by=['site_no', 'year', 'month'], inplace=True)
    aggregated['obsnme'] = [f"{prefix}_{dt:{obsnme_suffix_format}}"
                            for prefix, dt in zip(aggregated['obsprefix'],
                                                  aggregated['datetime'])]
    aggregated['obgnme'] = [f"{prefix}_{obgnme_suffix}"
                            for prefix in aggregated['obgnme']]
    cols = ['datetime', 'site_no', 'obsprefix', 'obsnme'] + data_cols + ['obgnme', 'n']
    aggregated = aggregated[cols]
    if outfile is not None:
        aggregated.fillna(-1e30).to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(aggregated):,} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(aggregated, str(outfile) + '.ins',
                          obsnme_column='obsnme', simulated_obsval_column='sim_obsval',
                          index=False)
    return aggregated
[docs]
def get_mean_monthly(base_data,
                     obsnme_suffix_format='%b',
                     exclude_suffix='ss',
                     obgnme_suffix='mean-monthly',
                     outfile=None,
                     write_ins=False):
    r"""Create observations of mean monthly, or means for months of the year
    (for example, the mean for Jan. across all years). Means are computed for
    all columns with floating point data.

    Parameters
    ----------
    base_data : DataFrame
        Table of preprocessed observations, such as that produced by
        :func:`mfobs.obs.get_base_obs`. Must have the following columns,
        in addition to columns of floating point data to aggregate
        (which can have any name):

        =================== =============================================================
        datetime            pandas datetimes, based on stress period end date
        site_no             unique site identifier
        obsprefix\ :sup:`1` prefix of observation name
        obsnme              observation name based on format of <obsprefix>_<suffix>\ :sup:`2`
        obgnme              observation group name
        =================== =============================================================

        1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no>
        2) Assumed to be formatted <obsprefix>_<`obsnme_suffix_format`>

    obsnme_suffix_format : str, optional
        Format for suffix of obsnmes.
        By default '%b', which returns the abbreviated month name (e.g. 'Jan').
    exclude_suffix : str or list-like
        Option to exclude observations from processing by suffix;
        e.g. 'ss' to exclude steady-state observations.
        By default, 'ss'
    obgnme_suffix : str
        Create new observation group names by appending this suffix to existing
        obgnmes, for example <existing obgnme>_<obgnme_suffix>
        by default, 'mean-monthly'
    outfile : str, optional
        CSV file to write output to.
        By default, None (no output written)
    write_ins : bool, optional
        Option to write instruction file, by default False

    Returns
    -------
    aggregated : DataFrame
        With the following columns (in addition to the data columns in `base_data`,
        which now contain the aggregated values):

        =================== =============================================================
        datetime            datetime of the first observation in each month group
        month               full month name for each monthly average (e.g. 'January')
        site_no             unique site identifier
        obsprefix\ :sup:`1` prefix of observation name
        obsnme              observation name based on format of <obsprefix>_<suffix>\ :sup:`2`
        obgnme              observation group name\ :sup:`3`
        n                   number of observations in each mean
        =================== =============================================================

        1) With format <site_no>-<variable> or simply <site_no>
        2) With suffix formatted with `obsnme_suffix_format`
        3) With format of <original obgnme>_<obgnme_suffix>
           e.g. heads_mean-monthly
    """
    # work with a copy, so the datetime conversion below
    # doesn't propagate back to the caller's DataFrame
    base_data = base_data.copy()
    base_data['datetime'] = pd.to_datetime(base_data['datetime'])
    # only compute statistics on transient obs
    if isinstance(exclude_suffix, str):
        exclude_suffix = [exclude_suffix]
    suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme]
    keep = ~np.isin(suffix, exclude_suffix)
    base_data = base_data.loc[keep].copy()

    # only include times where there are both an observation and sim. equivalent
    base_data.dropna(subset=['obsval', 'sim_obsval'], inplace=True)

    # group across all years by month of the year
    grouped = base_data.groupby([base_data['site_no'],
                                 base_data['datetime'].dt.month])
    # start from the first row in each group (retains the non-aggregated columns),
    # then overwrite the floating point columns with the group means
    aggregated = grouped.first()
    data_cols = [c for c, dtype in base_data.dtypes.items() if 'float' in dtype.name]
    for c in data_cols:
        aggregated[c] = grouped[c].mean()
    # record the number of observations contributing to each mean
    aggregated['n'] = grouped.count().iloc[:, 0]
    aggregated.index.set_names(['site_no', 'month'], inplace=True)
    aggregated.reset_index(inplace=True)
    aggregated.sort_values(by=['site_no', 'month'], inplace=True)
    # change month column to be name instead of number
    aggregated['month'] = [f"{dt:%B}" for dt in aggregated['datetime']]
    aggregated['obsnme'] = [f"{prefix}_{dt:{obsnme_suffix_format}}".lower()
                            for prefix, dt in zip(aggregated['obsprefix'],
                                                  aggregated['datetime'])]
    aggregated['obgnme'] = [f"{prefix}_{obgnme_suffix}"
                            for prefix in aggregated['obgnme']]
    cols = ['datetime', 'month', 'site_no', 'obsprefix', 'obsnme'] + data_cols + ['obgnme', 'n']
    aggregated = aggregated[cols]
    if outfile is not None:
        aggregated.fillna(-1e30).to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(aggregated):,} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(aggregated, str(outfile) + '.ins',
                          obsnme_column='obsnme', simulated_obsval_column='sim_obsval',
                          index=False)
    return aggregated
[docs]
def get_log10_observations(base_data,
                           obsnme_prefix_variable='log10',
                           obsnme_suffix_format='%Y%m%d',
                           exclude_suffix='ss',
                           obgnme_suffix='log10',
                           fill_zeros_with=0,
                           outfile=None,
                           write_ins=False):
    r"""Create observations of log values. Log values are computed for
    all columns with floating point data.

    Parameters
    ----------
    base_data : DataFrame
        Table of preprocessed observations, such as that produced by
        :func:`mfobs.obs.get_base_obs`. Must have the following columns,
        in addition to columns of floating point data to aggregate
        (which can have any name):

        =================== =============================================================
        datetime            pandas datetimes, based on stress period end date
        site_no             unique site identifier
        obsprefix\ :sup:`1` prefix of observation name
        obsnme              observation name based on format of <obsprefix>_<suffix>\ :sup:`2`
        obgnme              observation group name
        =================== =============================================================

        1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no>
        2) Assumed to be formatted <obsprefix>_<`obsnme_suffix_format`>

    obsnme_prefix_variable : str
        Variable name indicating the type of observation. Is appended to the site number (site_no)
        to create new observation name prefixes (obsprefixes) in the format of <site_no>-<variable>.
        For example, by default, an observation prefix in base_data of '04026300-flow' or '04026300'
        (both for a site_no of '04026300'), would result in a new observation prefix of
        '04026300-flow-log10'. A log10 flow computed from an original flow observation named '04026300-flow_20221103'
        or '04026300_20221103' (at site '04026300' on 2022/11/03) would be named '04026300-flow-log10_20221103'.
        by default, 'log10'.
    obsnme_suffix_format : str, optional
        Format for suffix of obsnmes.
        By default '%Y%m%d' (e.g. 20010101 for Jan 1, 2001)
    exclude_suffix : str or list-like
        Option to exclude observations from processing by suffix;
        e.g. 'ss' to exclude steady-state observations.
        By default, 'ss'
    obgnme_suffix : str
        Create new observation group names by appending this suffix to existing
        obgnmes, for example <existing obgnme>_<obgnme_suffix>
        by default, 'log10'
    fill_zeros_with : numeric
        Fill value in log data for zero values in base_data.
        By default, 0
    outfile : str, optional
        CSV file to write output to.
        By default, None (no output written)
    write_ins : bool, optional
        Option to write instruction file, by default False

    Returns
    -------
    log_base_data : DataFrame
        With the following columns (in addition to the data columns in `base_data`,
        which now contain the log10 values):

        =================== =============================================================
        datetime            observation dates as datetimes
        site_no             unique site identifier
        obsprefix\ :sup:`1` prefix of observation name
        obsnme              observation name based on format of <obsprefix>_<suffix>\ :sup:`2`
        obgnme              observation group name\ :sup:`3`
        =================== =============================================================

        1) With format <site_no>-<variable>-<obsnme_prefix_variable> or <site_no>-<obsnme_prefix_variable>
        2) With suffix formatted with `obsnme_suffix_format`
        3) With format of <original obgnme>_<obgnme_suffix>
           e.g. flux_log10
    """
    # work with a copy, so the datetime conversion below
    # doesn't propagate back to the caller's DataFrame
    base_data = base_data.copy()
    base_data['datetime'] = pd.to_datetime(base_data['datetime'])
    # only compute statistics on transient obs
    if isinstance(exclude_suffix, str):
        exclude_suffix = [exclude_suffix]
    suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme]
    keep = ~np.isin(suffix, exclude_suffix)
    base_data = base_data.loc[keep].copy()

    log_base_data = base_data.copy()
    data_cols = [c for c, dtype in base_data.dtypes.items() if 'float' in dtype.name]
    for c in data_cols:
        # suppress divide-by-zero warnings from log10(0);
        # the resulting -inf values are replaced with fill_zeros_with below
        with np.errstate(divide='ignore'):
            log_base_data[c] = np.log10(base_data[c])
        log_base_data.loc[base_data[c] == 0, c] = fill_zeros_with  # handle zero values
    log_base_data['obsprefix'] = [f"{prefix}-{obsnme_prefix_variable}"
                                  for prefix in log_base_data['obsprefix']]
    log_base_data['obsnme'] = [f"{prefix}_{dt:{obsnme_suffix_format}}".lower()
                               for prefix, dt in zip(log_base_data['obsprefix'],
                                                     log_base_data['datetime'])]
    log_base_data['obgnme'] = [f"{prefix}_{obgnme_suffix}"
                               for prefix in log_base_data['obgnme']]
    cols = ['datetime', 'site_no', 'obsprefix', 'obsnme'] + data_cols + ['obgnme']
    log_base_data = log_base_data[cols]
    if outfile is not None:
        log_base_data.fillna(-1e30).to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(log_base_data):,} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(log_base_data, str(outfile) + '.ins',
                          obsnme_column='obsnme', simulated_obsval_column='sim_obsval',
                          index=False)
    return log_base_data
[docs]
def get_baseflow_observations(base_data,
                              obsnme_prefix_variable='baseflow',
                              obsnme_suffix_format='%Y%m%d',
                              exclude_suffix='ss',
                              obgnme_suffix='baseflow',
                              missing_obs_fill_value=0.,
                              outfile=None,
                              write_ins=False, **kwargs):
    r"""Create observations of base flow using the
    BFI/Institute of Hydrology hydrograph separation method (:func:`mfobs.sep.ih_method`).

    Parameters
    ----------
    base_data : DataFrame
        Table of preprocessed observations, such as that produced by
        :func:`mfobs.obs.get_base_obs`. Must have the following columns,
        in addition to columns of floating point data to aggregate
        (which can have any name):

        =================== =============================================================
        datetime            pandas datetimes, based on stress period end date
        site_no             unique site identifier
        obsprefix\ :sup:`1` prefix of observation name
        obsnme              observation name based on format of <obsprefix>_<suffix>\ :sup:`2`
        obgnme              observation group name
        =================== =============================================================

        1) where obsprefix is assumed to be formatted as <site_no>-<variable> or simply <site_no>
        2) Assumed to be formatted <obsprefix>_<`obsnme_suffix_format`>

    obsnme_prefix_variable : str
        Variable name indicating the type of observation. Is appended to the site number (site_no)
        to create new observation name prefixes (obsprefixes) in the format of <site_no>-<variable>.
        For example, by default, an observation prefix in base_data of '04026300-flow' or '04026300'
        (both for a site_no of '04026300'), would result in a new observation prefix of
        '04026300-baseflow'. A base flow computed from a total flow observation named '04026300-flow_20221103'
        or '04026300_20221103' (at site '04026300' on 2022/11/03) would be named '04026300-baseflow_20221103'.
        by default, 'baseflow'.
    obsnme_suffix_format : str, optional
        Format for suffix of obsnmes.
        By default '%Y%m%d' (e.g. 20010101 for Jan 1, 2001)
    exclude_suffix : str or list-like
        Option to exclude observations from processing by suffix;
        e.g. 'ss' to exclude steady-state observations.
        By default, 'ss'
    obgnme_suffix : str
        Create new observation group names by appending this suffix to existing
        obgnmes, for example <existing obgnme>_<obgnme_suffix>
        by default, 'baseflow'
    missing_obs_fill_value : float
        Baseflow observations are different from most other observation types in that
        they are produced by hydrograph separation, which can lead to different numbers
        of observations with changing conditions. For example, a given parameter set
        may produce a total flow hydrograph with a different number of ordinates (turning points),
        which in turn can change the interpolation of baseflow values between ordinates, potentially
        producing either 'extra' observations (not in the instruction file) or missing observations.
        Missing observations will be filled with `missing_obs_fill_value`, so that PEST doesn't crash. A
        value of 0. is recommended, as missing observations are often associated with low or no-flow
        periods, especially near the start or end of a timeseries.
    outfile : str, optional
        CSV file to write output to.
        By default, None (no output written)
    write_ins : bool, optional
        Option to write instruction file, by default False
    **kwargs : key-word arguments to :func:`mfobs.sep.ih_method`

    Returns
    -------
    bf_base_data : DataFrame
        With the following columns (in addition to the data columns in `base_data`,
        which now contain the baseflow-separated values):

        =================== =============================================================
        datetime            observation dates as datetimes
        site_no             unique site identifier
        obsprefix\ :sup:`1` prefix of observation name
        obsnme              observation name based on format of <obsprefix>_<suffix>\ :sup:`2`
        obgnme              observation group name\ :sup:`3`
        =================== =============================================================

        1) With format <site_no>-<variable> or simply <site_no>
        2) With suffix formatted with `obsnme_suffix_format`
        3) With format of <original obgnme>_<obgnme_suffix>
           e.g. flux_baseflow

    Notes
    -----
    """
    # work on a copy so the caller's DataFrame isn't mutated
    base_data = base_data.copy()
    base_data['datetime'] = pd.to_datetime(base_data['datetime'])

    # only compute baseflow on transient obs
    if isinstance(exclude_suffix, str):
        exclude_suffix = [exclude_suffix]
    suffixes = [obsnme.split('_')[1] for obsnme in base_data.obsnme]
    keep = ~np.isin(suffixes, exclude_suffix)
    base_data = base_data.loc[keep].copy()

    bf_base_data = base_data.copy()
    # apply hydrograph separation to all floating-point data columns
    data_cols = [c for c, dtype in base_data.dtypes.items() if 'float' in dtype.name]
    bf_base_data.index = pd.to_datetime(bf_base_data['datetime'])

    # ih_method requires a series of at least 2x the BFI block length (default=5);
    # hoist the invariant lookup out of the loops
    block_length = kwargs.get('block_length', 5)
    min_length = block_length * 2
    dfs = []
    for obsprefix, group in bf_base_data.groupby('obsprefix'):
        for c in data_cols:
            if len(group[c]) >= min_length:
                results = ih_method(group[c], **kwargs)
                group[c] = results['QB']
            else:
                # series too short for hydrograph separation;
                # values pass through unmodified (total flow), so flag it
                warnings.warn(
                    f"Site {obsprefix}: {len(group[c])} values in column '{c}' "
                    f"is less than 2x the BFI block length ({block_length}); "
                    "skipping baseflow separation (values passed through unmodified).")
        dfs.append(group)
    bf_base_data = pd.concat(dfs)

    # drop nan values (dates between baseflow ordinates at a site
    # that have no interpolated baseflow value)
    bf_base_data.dropna(subset=['obsval', 'sim_obsval'], axis=0, inplace=True)

    # rename the observations to <site_no>-<variable>_<date suffix>
    bf_base_data['obsprefix'] = [f"{site_no}-{obsnme_prefix_variable}"
                                 for site_no in bf_base_data['site_no']]
    bf_base_data['obsnme'] = [f"{prefix}_{dt:{obsnme_suffix_format}}".lower()
                              for prefix, dt in zip(bf_base_data['obsprefix'],
                                                    bf_base_data['datetime'])]
    bf_base_data['obgnme'] = [f"{prefix}_{obgnme_suffix}"
                              for prefix in bf_base_data['obgnme']]
    cols = ['datetime', 'site_no', 'obsprefix', 'obsnme'] + data_cols + ['obgnme']
    bf_base_data = bf_base_data[cols]

    if outfile is not None:
        # write the instruction file
        if write_ins:
            write_insfile(bf_base_data, str(outfile) + '.ins',
                          obsnme_column='obsnme', simulated_obsval_column='sim_obsval',
                          index=False)
        # otherwise, reconcile the observations with the existing instruction file
        else:
            insfile_obs = get_insfile_observations(str(outfile) + '.ins')
            missing_in_output = set(insfile_obs).difference(bf_base_data['obsnme'])
            extra_in_output = set(bf_base_data['obsnme']).difference(insfile_obs)
            # fill missing observations so PEST doesn't crash
            if any(missing_in_output):
                bf_base_data.index = bf_base_data['obsnme']
                # re-index the observation data to add rows for the missing obs
                reindexed = bf_base_data.reindex(insfile_obs)
                # identify rows containing missing obs
                filled = reindexed['sim_obsval'].isna()
                # refill columns for the missing obs
                reindexed['obsnme'] = reindexed.index
                reindexed.loc[filled, 'datetime'] = [pd.Timestamp(s.split('_')[1].split('-')[0])
                                                     for s in reindexed.loc[filled, 'obsnme']]
                reindexed.loc[filled, 'site_no'] = [s.split('_')[0].split('-')[0]
                                                    for s in reindexed.loc[filled, 'obsnme']]
                reindexed.loc[filled, 'obsprefix'] = [s.split('_')[0]
                                                      for s in reindexed.loc[filled, 'obsnme']]
                reindexed.loc[filled, 'obgnme'] = 'filled-bf'
                reindexed.loc[filled, data_cols] = missing_obs_fill_value
                # explicit check instead of assert (asserts vanish under python -O)
                if reindexed.isna().any().any():
                    raise ValueError("Unfilled NaN values remain after re-indexing "
                                     "baseflow observations to the instruction file.")
                bf_base_data = reindexed
            # drop any extra observations (not in the instruction file)
            if any(extra_in_output):
                bf_base_data = bf_base_data.loc[~bf_base_data.index.isin(extra_in_output)]
        bf_base_data.to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(bf_base_data):,} observations to {outfile}')
    return bf_base_data