"""
Functions for processing head observations
"""
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
from mfobs.checks import check_obsnme_suffix
from mfobs.fileio import load_array, write_insfile
from mfobs.modflow import get_mf6_single_variable_obs, get_ij, get_transmissivities
from mfobs.utils import fill_nats, set_period_start_end_dates
[docs]
def get_head_obs(perioddata, modelgrid_transform, model_output_file,
observed_values_file,
gwf_obs_input_file,
observed_values_metadata_file=None,
variable_name='head',
observed_values_site_id_col='obsprefix',
observed_values_datetime_col='datetime',
obsnme_date_suffix=True,
obsnme_suffix_format='%Y%m',
observed_values_obsval_col='obsval',
observed_values_x_col='x',
observed_values_y_col='y',
observed_values_screen_top_col='screen_top',
observed_values_screen_botm_col='screen_botm',
observed_values_layer_col=None,
observed_values_group_column='obgnme',
observed_values_unc_column='uncertainty',
aggregrate_observed_values_by='mean',
gwf_obs_block=None,
drop_groups=None,
hk_arrays=None, top_array=None, botm_arrays=None,
label_period_as_steady_state=None, steady_state_period_start=None,
steady_state_period_end=None, forecast_sites=None,
forecast_start_date=None, forecast_end_date=None,
forecasts_only=False, forecast_sites_only=False,
min_open_interval_frac_in_model=0.5,
verbose=True, write_ins=False, outfile=None):
"""Post-processes model output to be read by PEST, and optionally,
writes a corresponding PEST instruction file. Reads model output
using get_mf6_single_variable_obs(). General paradigm is to include all model
layers in the MODFLOW input for each observation, and then post-process the model
results to a single value by computing a transmissivity-weighted average.
Observation names to match observed values to their simulated equivalents are constructed
in the format of <obsprefix>_<date suffix>, where obsprefix is a site identifier taken
from the ``observed_values_site_id_col`` in ``observed_values_file``. In creating
observation names for MODFLOW output, the column names in the observation CSV output
are used for the prefixes. Therefore, the identifiers in ``observed_values_site_id_col``
should correspond to observations in the MODFLOW observation input. The date suffix
is formatted using the ``obsnme_date_suffix_format`` parameter, which is also
passed to :func:`~mfobs.modflow.get_mf6_single_variable_obs` for assigning observation
names to the MODFLOW observation output.
Optionally, a model stress period can be labeled as steady-state (``label_period_as_steady_state``),
representing average conditions over a time period bracked by a ``steady_state_period_start`` and
``steady_state_period_end``. In this case, the simulated values for the labeled stress period are
matched to average values for the steady-state time period.
Parameters
----------
perioddata : str
Path to csv file with start/end dates for stress periods. Must have columns
'time' (modflow time, in days), 'start_datetime' (start date for the stress period)
and 'end_datetime' (end date for the stress period).
modelgrid_transform : str
An `affine.Affine <https://github.com/sgillies/affine>`_ object describing the orientation
of the model grid. Modflow-setup :class:`~mfsetup.grid.MFsetupGrid` have this attached
via the :meth:`~mfsetup.grid.MFsetupGrid.transform` property. Example::
modelgrid_transform=affine.Affine(1000.0, 0.0, 500955,
0.0, -1000.0, 1205285)
for a uniform spacing of 1000 and upper left corner of 500955, 1205285
with a rotation of 45 degrees, counter-clockwise about the upper left corner::
modelgrid_transform=affine.Affine(1000.0, 0.0, 500955,
0.0, -1000.0, 1205285).rotation(45.)
An ``affine.Affine`` instance can also be created from a
`Modflow-setup <https://github.com/aleaf/modflow-setup>`_
grid JSON file via the :func:`~mfobs.modflow.get_modelgrid_transform` function.
model_output_file : str
Modflow-6 head observation CSV output file.
Read by :func:`~mfobs.modflow.get_mf6_single_variable_obs`.
observed_values_file : str or DataFrame
CSV file or DataFrame with observed values. Must have the following columns
(default names are shown, other names can be specified with
observed_values_**_col variables below):
============= ========================
site_id site identifier
datetime date/time of observation
obsval observed value
============= ========================
can optionally include these columns, or this information can be supplied
in an observed_values_metadata_file, which will be joined on site_id
============= ========================
x x location
y y location
screen_top screen top elevation
screen_botm screen bottom elevation
============= ========================
If supplied, observation group and uncertainty information will be
passed through to the output ``base_data`` DataFrame:
============= ==================================
obgnme observation group
uncertainty estimated measurement uncertainty
============= ==================================
Locations and screen tops and bottoms are assumed to be in the same
CRS and length units as the model.
observed_values_metadata_file : str, optional
Site information for the observed values timeseries. Should include a
`site_id` column that is the same as observed_values_site_id_col, and any of
the following columns that are not in the observed_values_file:
============= ========================
x x location
y y location
screen_top screen top elevation
screen_botm screen bottom elevation
============= ========================
gwf_obs_input_file : str
Input file to MODFLOW-6 observation utility (contains layer information).
variable_name : str, optional
Column with simulated output will be named "sim_<variable_name",
by default 'head'
observed_values_site_id_col : str, optional
Column name in observed_values_file with site identifiers,
by default 'obsprefix'
observed_values_datetime_col : str, optional
Column name in observed_values_file with observation date/times,
by default 'datetime'
obsnme_date_suffix : bool
If true, give observations a date-based suffix. Otherwise, assign a
stress period-based suffix. In either case, the format of the suffix
is controlled by obsnme_suffix_format.
by default True
obsnme_suffix_format : str, optional
Format for suffix of obsnmes. Observation names are created following the format of
<obsprefix>_<date or stress period suffix>. By default, ``'%Y%m'``,
which would yield ``'202001'`` for a Jan, 2020 observation
(obsnme_date_suffix=True). If obsnme_date_suffix=False, obsnme_suffix_format
should be a decimal format in the "new-style" string format
(e.g. '{:03d}', which would yield ``'001'`` for stress period 1.)
observed_values_obsval_col : str, optional
Column name in observed_values_file with observed values,
by default 'obsval'
observed_values_x_col : str, optional
Column name in observed_values_file with x-coordinates,
by default 'x'
observed_values_y_col : str, optional
Column name in observed_values_file with y-coordinates,
by default 'y'
observed_values_screen_top_col : str, optional
Column name in observed_values_file with screen top elevations,
by default 'screen_top'
observed_values_screen_botm_col : str, optional
Column name in observed_values_file with screen bottom elevations,
by default 'screen_botm'
observed_values_layer_col : str, optional
As an alternative to providing screen tops and bottoms, the model layer
for each observation can be specified directly via a layer column
of zero-based layer numbers.
by default None
observed_values_group_column : str, optional
Column name in observed_values_file with observation group information.
Passed through to output ``base_data`` DataFrame, otherwise not required.
by default 'obgnme'
observed_values_unc_column : str, optional
Column name in observed_values_file with observation uncertainty values.
Passed through to output ``base_data`` DataFrame, otherwise not required.
by default 'uncertainty'
aggregrate_observed_values_by : str
Method for aggregating observed values to the model stress periods,
if there are multiple observed values in a stress period. Can be any
of the method calls on the pandas
`Resampler <https://pandas.pydata.org/pandas-docs/stable/reference/resampling.html>`_
object. By default, 'mean'
gwf_obs_block : None or int
Argument to read a specific observation block or all blocks from GWF
observation utility file. Value of None returns observations from all
blocks. Integer value returns obs from a specifc block, in (zero-based)
order, from top to bottom. For example, a value of 0 would return the
first obs block, value of 1 would return the second obs block, and so
on. Modflow-setup writes all of the observation input to a single block,
but observation input can be broken out into multiple blocks (one per
file). by default, None (All blocks)
drop_groups : sequence, optional
Observation groups to exclude from output, by default None
hk_arrays : list-like of pathlikes or ndarray, optional
File paths to text arrays with hydraulic conductivity values
(ordered by model layer). Used in the transmissivity-weighted averaging.
by default None
top_array : str, pathlike or ndarray, optional
File paths to text array with model top elevations.
Used in the transmissivity-weighted averaging.
by default None
botm_arrays : str, pathlike or ndarray, optional
File paths to text arrays with model cell bottom elevations.
(ordered by model layer). Used in the transmissivity-weighted averaging.
by default None
label_period_as_steady_state : int, optional
Zero-based model stress period where observations will be
assigned the suffix 'ss' instead of a date suffix.
By default, None, in which case all model output is assigned
a date suffix based on the start date of the stress period.
Passed to :func:`~mfobs.modflow.get_mf6_single_variable_obs`.
steady_state_period_start : str, optional
Start date for the period representing steady-state conditions.
Observations between ``steady_state_period_start`` and ``steady_state_period_end``
will be averaged to create additional observations with the suffix 'ss'.
The steady-state averages will be matched to model output from the
stress period specified by ``label_period_as_steady_state``.
By default None, in which case all observations are used
(if ``observed_values_datetime_col is None``)
or no steady-state observations
are created (``observed_values_datetime_col`` exists).
steady_state_period_end : str, optional
End date for the period representing steady-state conditions.
By default None, in which case all observations are used
(if ``observed_values_datetime_col is None``)
or no steady-state observations
are created (``observed_values_datetime_col`` exists).
forecast_sites : str or sequence, optional
At these sites, observations will be created for each simulated value,
regardless is there is an observed equivalent. Can be supplied
as a sequence of site numbers (`site_id`s) or ``'all'`` to
include all sites. By default, None (no forecast sites).
forecast_start_date : str, optional
Start date for forecast period. When forecast_sites is not
``None``, forecast observations will be generated for each
time between `forecast_start_date` and `forecast_end_date`.
By default, None (generate forecasts for any time with missing values).
forecast_end_date : str, optional
End date for forecast period. When forecast_sites is not
``None``, forecast observations will be generated for each
time between `forecast_start_date` and `forecast_end_date`.
By default, None (generate forecasts for any time with missing values).
forecasts_only : bool, optional
Use this option to only output forecast observations
(those without an observed equivalent), subject to the parameters of
`forecast_sites`, `forecast_start_date`, and `forecast_end_date`.
forecast_sites_only : bool, optional
Option to only output observations at sites specified
with `forecast_sites` (has no effect if ``forecast_sites='all'``). If
``forecasts_only=False``, the output will include forecast and non-forecast
observations (for a continuous time-series).
min_open_interval_frac_in_model : float ranging from 0 to 1, optional
Option to cull observations with less than min_open_interval_frac_in_model
fraction of their open interval within the model domain. Dropped head observation
locations will be reported in dropped_head_observations_above_or_below_model.csv.
by default, 0.5 (only retain observations with 50%
or more of their open interval within the model domain)
verbose : bool
Controls the verbosity of screen outputs. If True, warnings will be printed
when there are no observations within a stress period and forecast_sites is None
(meaning no observation results will be processed for that stress period).
by default, True
outfile : str, optional
CSV file to write output to.
By default, None (no output written)
write_ins : bool, optional
Option to write instruction file, by default False
Returns
-------
base_data : DataFrame
With the following columns:
===================== ====================================================
datetime pandas datetimes for the start of each stress period
per model stress period
obsprefix observation site identifier
obsnme observation name based on format of <obsprefix>_'%Y%m'
obs_<variable_name> observed values
sim_<variable_name> simulated observation equivalents
screen_top screen top elevation
screen_botm screen bottom elevation
===================== ====================================================
Example observation names:
site1000_202001, for a Jan. 2020 observation at site1000 (obsnme_date_suffix=True)
site1000_001, for a stress period 1 observation at site1000 (obsnme_date_suffix=False)
a steady-state stress period specified with label_period_as_steady_state
is given the suffix of 'ss'
e.g. site1000_ss
Notes
-----
All observation names and observation prefixes are converted to lower case
to avoid potential case issues.
"""
# validation checks
check_obsnme_suffix(obsnme_date_suffix, obsnme_suffix_format,
function_name='get_head_obs')
outpath = Path('.')
if outfile is not None:
outpath = Path(outfile).parent
obs_values_column = 'obsval' # + variable_name # output column with observed values
sim_values_column = 'sim_obsval' # + variable_name # output column with simulated equivalents to observed values
perioddata = perioddata.copy()
set_period_start_end_dates(perioddata)
perioddata.index = perioddata.per
results = get_mf6_single_variable_obs(perioddata, model_output_file=model_output_file,
gwf_obs_input_file=gwf_obs_input_file,
#variable_name=variable_name,
#obsnme_date_suffix=obsnme_date_suffix,
#obsnme_suffix_format=obsnme_suffix_format,
#label_period_as_steady_state=label_period_as_steady_state,
gwf_obs_block=gwf_obs_block
)
# rename columns to their defaults
renames = {#observed_values_site_id_col: 'obsprefix',
observed_values_datetime_col: 'datetime',
observed_values_x_col: 'x',
observed_values_y_col: 'y',
observed_values_screen_top_col: 'screen_top',
observed_values_screen_botm_col: 'screen_botm',
observed_values_layer_col: 'layer',
observed_values_group_column: 'obgnme',
observed_values_unc_column: 'uncertainty'
}
if not isinstance(observed_values_file, pd.DataFrame):
observed = pd.read_csv(observed_values_file,
dtype={observed_values_site_id_col: object})
else:
observed = observed_values_file
if len(observed) == 0:
raise ValueError("No observed values to process!")
observed.rename(columns=renames, inplace=True)
observed['obsprefix'] = observed[observed_values_site_id_col].str.lower()
# read in the observed values metadata
if observed_values_metadata_file is not None:
if not isinstance(observed_values_metadata_file, pd.DataFrame):
metadata = pd.read_csv(observed_values_metadata_file,
dtype={observed_values_site_id_col: object})
else:
metadata = observed_values_metadata_file
metadata.rename(columns=renames, inplace=True)
metadata['obsprefix'] = metadata[observed_values_site_id_col].str.lower()
# join the metadata to the observed data
metadata.index = metadata['obsprefix'].values
observed.index = observed['obsprefix'].values
join_cols = [c for c in ['screen_top', 'screen_botm', 'x', 'y', 'layer']
if c in metadata.columns]
observed = observed.join(metadata[join_cols], rsuffix='_m')
assert not observed[['x', 'y']].isna().any().any()
# make a dictionary of site metadata for possible use later
temp = metadata.copy()
temp.index = temp['obsprefix']
site_info_dict = temp.to_dict()
else:
observed.index = observed['obsprefix'].values
# make a dictionary of site metadata for possible use later
temp = observed.copy()
temp.index = temp['obsprefix']
site_info_dict = temp.to_dict()
# convert obs names and prefixes to lower case
#observed['obsprefix'] = observed['obsprefix'].str.lower()
# delete some columns from the observed values file
# which result in values assigned by the function
# later being overwritten with nans
del_cols = ['datetime', 'per']
for col in del_cols:
if col in site_info_dict:
del site_info_dict[col]
del temp
# cast datetimes to pandas datetimes
# (observed data may not have a datetime col. if model is steady-state)
if 'datetime' in observed.columns:
observed['datetime'] = pd.to_datetime(observed['datetime'])
# not necessarily True
observed['steady'] = False # flag for steady-state observations
elif label_period_as_steady_state is not None:
observed['datetime'] = pd.to_datetime(
perioddata['start_datetime'][label_period_as_steady_state])
observed['steady'] = True
else:
if len(perioddata) == 1:
observed['datetime'] = pd.to_datetime(perioddata['start_datetime'][0])
observed['steady'] = True
else:
raise ValueError("Model has more than one stress period "
"but observed data have no 'datetime' column.")
# drop model results that aren't in the obs information file
# these are probably observations that aren't in the model time period
# (and therefore weren't included in the parent model calibration;
# but modflow-setup would include them in the MODFLOW observation input)
# also drop sites that are in the obs information file, but not in the model results
# these include sites outside of the model (i.e. in the inset when looking at the parent)
# no forecast observations;
# drop sites that don't have an observed/sim equivalent pair
if observed_values_metadata_file is not None:
observed_obsprefixes = set(metadata.obsprefix)
else:
observed_obsprefixes = set(observed.obsprefix)
no_info_sites = set(results.obsprefix).symmetric_difference(observed_obsprefixes)
if forecast_sites == 'all':
# forecast observations at all simulated sites
# (only drop sites that aren't simulated)
no_info_sites = observed_obsprefixes.difference(results.obsprefix)
elif forecast_sites is not None:
# remove selected forecast sites from 'no_info' sites to drop
forecast_sites = {s.lower() for s in forecast_sites}
no_info_sites = no_info_sites.difference(forecast_sites)
# dump these out to a csv
if len(no_info_sites) > 0:
print('Dropping {} sites with no information'.format(len(no_info_sites)))
dropped_obs_outfile = outpath / 'dropped_head_observation_sites.csv'
results.loc[results.obsprefix.isin(no_info_sites)].to_csv(dropped_obs_outfile,
index=False)
results = results.loc[~results.obsprefix.isin(no_info_sites)].copy()
observed = observed.loc[~observed.obsprefix.isin(no_info_sites)].copy()
# get_mf6_single_variable_obs returns values for each layer
# collapse these into one value for each location, time
# by taking the transmissivity-weighted average
if observed_values_layer_col is None:
if isinstance(hk_arrays[0], str) or isinstance(hk_arrays[0], Path):
hk = load_array(hk_arrays)
else:
hk = hk_arrays
if isinstance(top_array, str) or isinstance(top_array, Path):
top = load_array(top_array)
else:
top = top_array
if isinstance(botm_arrays[0], str) or isinstance(botm_arrays[0], Path):
botm = load_array(botm_arrays)
else:
botm = botm_arrays
# get the x and y location and open interval corresponding to each head observation
if observed_values_metadata_file is not None:
x = dict(zip(metadata['obsprefix'], metadata['x']))
y = dict(zip(metadata['obsprefix'], metadata['y']))
else:
x = dict(zip(observed['obsprefix'], observed['x']))
y = dict(zip(observed['obsprefix'], observed['y']))
results['x'] = [x[obsprefix] for obsprefix in results.obsprefix]
results['y'] = [y[obsprefix] for obsprefix in results.obsprefix]
# get head values based on T-weighted average of open interval
if observed_values_layer_col is None:
if 'screen_top' in observed.columns:
if observed_values_metadata_file is not None:
screen_top = dict(zip(metadata['obsprefix'], metadata['screen_top']))
else:
screen_top = dict(zip(observed['obsprefix'], observed['screen_top']))
if 'screen_botm' not in observed.columns:
screen_botm = screen_top
elif 'screen_botm' not in observed.columns:
raise ValueError('Observed values need at least a screen top or screen bottom column')
if 'screen_botm' in observed.columns:
if observed_values_metadata_file is not None:
screen_botm = dict(zip(metadata['obsprefix'], metadata['screen_botm']))
else:
screen_botm = dict(zip(observed['obsprefix'], observed['screen_botm']))
if 'screen_top' not in observed.columns:
screen_top = screen_botm
elif 'screen_top' not in observed.columns:
raise ValueError('Observed values need at least a screen top or screen bottom column')
results['screen_top'] = [screen_top[obsprefix] for obsprefix in results.obsprefix]
results['screen_botm'] = [screen_botm[obsprefix] for obsprefix in results.obsprefix]
# for each model stress period, get the simulated values
# and the observed equivalents
observed.index = pd.to_datetime(observed.datetime)
results.index = pd.to_datetime(results.datetime)
# integer column for stress period- or timestep-based obsnme suffixes
# timestep-based observations
if 'timestep' in perioddata.columns:
perioddata['unique_timestep'] = list(range(len(perioddata)))
per_column = 'unique_timestep'
times = dict(zip(perioddata['time'], perioddata['unique_timestep']))
results[per_column] = [times[t] for t in results['time']]
# stress period-based observations
else:
per_column = 'per'
observed_simulated_combined = []
# if no datetime column is supplied with observations,
# only make steady state obs
if observed_values_datetime_col is None and \
label_period_as_steady_state is not None:
idx = label_period_as_steady_state
perioddata = perioddata[idx:idx+1]
for i, r in perioddata.iterrows():
# get the equivalent observed values
start, end = str(r['start_datetime']), str(r['end_datetime'])
if start[:4] == '2010':
j=2
# date-based suffix
if obsnme_date_suffix:
suffix = pd.Timestamp(end).strftime(obsnme_suffix_format)
# stress or timestep-based period-based suffix
else:
suffix = f"{r[per_column]:{obsnme_suffix_format.strip('{:}')}}"
# steady-state observations can represent a period
# other than the "modflow time" in the perioddata table
if r['per'] == label_period_as_steady_state:
suffix = 'ss'
if steady_state_period_start is not None:
start = str(steady_state_period_start)
if steady_state_period_end is not None:
end = str(steady_state_period_end)
# don't process observations for a steady-state period unless
# it is explicitly labeled as such and given a representative date range
#elif r['steady']:
# continue
#aggregrate_observed_values_method = 'mean'
#observed_in_period_rs = aggregrate_to_period(
# observed, start, end,
# aggregrate_observed_values_method=aggregrate_observed_values_method,
# obsnme_suffix=suffix)
#if observed_in_period_rs is None:
# if per_column == 'per':
# warnings.warn(('Stress period {}: No observations between start and '
# 'end dates of {} and {}!'.format(r['per'], start, end)))
# continue
# for now, require results to have period or unique timestep column
# otherwise, if slicing by datetimes, can run into problem of
# for example, initial steady-state periods being included
# with first transient period
# leading to a duplicate index error when trying to pivot below
#data = results.loc[start:end].copy()
data = results.loc[results[per_column] == r[per_column]].copy()
if len(data) == 0:
continue
# kludge to assign obsnmes to model results
# until head obs handling gets refactored into obs.get_base_obs
data['obsnme'] = ['{}_{}'.format(prefix.lower(), suffix)
for prefix in data.obsprefix]
data.index = data['obsnme']
observed_in_period = observed.sort_index().loc[start:end].reset_index(drop=True)
# No forecast observations and no observed values in period
if verbose and (forecast_sites is None) and (len(observed_in_period) == 0):
warnings.warn(('Stress period {}: No observations between start and '
'end dates of {} and {}!'.format(r['per'], start, end)))
continue
# If there are forecast sites and observed data in this period
elif len(observed_in_period) > 0:
observed_in_period.sort_values(by=['obsprefix', 'datetime'], inplace=True)
if 'n' not in observed_in_period.columns:
observed_in_period['n'] = 1
by_site = observed_in_period.groupby('obsprefix')
observed_in_period_rs = by_site.first()
data_cols = [c for c, dtype in observed_in_period.dtypes.items()
if 'float' in dtype.name]
for c in data_cols:
observed_in_period_rs[c] = getattr(by_site[c], aggregrate_observed_values_by)()
observed_in_period_rs['n'] = by_site.n.sum()
observed_in_period_rs['datetime'] = pd.Timestamp(end)
if observed_in_period_rs[observed_values_obsval_col].isna().any():
raise ValueError(f'Nan {observed_values_obsval_col} values in observation data')
observed_in_period_rs.reset_index(inplace=True) # put obsprefix back
missing_cols = set(observed_in_period.columns).difference(observed_in_period_rs.columns)
for col in missing_cols:
observed_in_period_rs[col] = by_site[col].first().values
observed_in_period_rs = observed_in_period_rs[observed_in_period.columns]
obsnames = ['{}_{}'.format(prefix.lower(), suffix)
for prefix in observed_in_period_rs.obsprefix]
observed_in_period_rs['obsnme'] = obsnames
observed_in_period_rs.index = observed_in_period_rs['obsnme']
# Forecast sites, but no observed data
else:
observed_in_period_rs = pd.DataFrame(columns=observed.columns)
# Simulated equivalents
# Option to get head values based on T-weighted average of open interval
if observed_values_layer_col is None:
# get a n layers x n sites array of simulated head observations
data = data.reset_index(drop=True)
heads_2d = data.pivot(columns='layer', values='sim_obsval', index='obsnme').T.values
obsnme = data.pivot(columns='layer', values='obsnme', index='obsnme').index.tolist()
# layers containing head values
# (in case there are any layers not represented in the head data)
head_layers = data['layer'].unique()
if heads_2d.shape[0] != botm.shape[0]:
heads_rs = np.ones((botm.shape[0], heads_2d.shape[1])) * np.nan
heads_rs[head_layers] = heads_2d
heads_2d = heads_rs
# x, y, screen_top and screen_botm have one value for each site
kwargs = {}
for arg in 'x', 'y', 'screen_top', 'screen_botm':
# pivot data to nsites rows x nlay columns
# positions without data are filled with nans
pivoted = data.pivot(columns='layer', values=arg, index='obsnme')
# reduce pivoted data to just one value per site by taking the mean
# (values should be the same across columns, which represent layers)
kwargs[arg] = pivoted.mean(axis=1).values
# get the row, column indices of the observations here
# so that we can include them in the output
i, j = get_ij(modelgrid_transform, kwargs.pop('x'), kwargs.pop('y'))
kwargs['i'] = i
kwargs['j'] = j
# cull observations to only those with >min_open_interval_pct_in_model
# of their open interval within the model
model_top = top[i, j]
model_botm = botm[-1, i, j]
screen_len = kwargs['screen_top'] - kwargs['screen_botm']
in_model_top = np.min([kwargs['screen_top'], model_top], axis=0)
in_model_botm = np.max([kwargs['screen_botm'], model_botm], axis=0)
in_model_len = in_model_top - in_model_botm
in_model_frac = (in_model_len / screen_len)
# handle wells with no open interval length
# (screen top == screen bottom)
keep = (in_model_frac > min_open_interval_frac_in_model) |\
(kwargs['screen_top'] < model_top) & (kwargs['screen_botm'] > model_botm)
# make a table of the wells that are being discarded
drop_wells = pd.DataFrame({
'obsnme': np.array(obsnme)[~keep],
'screen_top': kwargs['screen_top'][~keep],
'screen_botm': kwargs['screen_botm'][~keep],
'model_top': model_top[~keep],
'model_botm': model_botm[~keep],
})
drop_wells['reason'] = f"<{min_open_interval_frac_in_model:.0%} of open interval in model"
drop_wells.to_csv(outpath / 'dropped_head_observations_above_or_below_model.csv',
float_format='%.2f')
obsnme = np.array(obsnme)[keep]
heads_2d = heads_2d[:, keep].copy()
for k, v in kwargs.items():
if isinstance(v, np.ndarray) and len(v) == len(keep):
kwargs[k] = v[keep]
i = i[keep].copy()
j = j[keep].copy()
keep_observed_obsnmes = [n for n in obsnme if n in observed_in_period_rs.index]
observed_in_period_rs = observed_in_period_rs.loc[keep_observed_obsnmes].copy()
# get the transmissivity associated with each head obs
T = get_transmissivities(heads_2d, hk, top, botm,
modelgrid_transform=modelgrid_transform, **kwargs
)
# compute transmissivity-weighted average heads
Tr_frac = T / T.sum(axis=0)
Tr_frac_df = pd.DataFrame(Tr_frac.transpose())
Tr_frac_df['obsnme'] = obsnme
Tr_frac_df.to_csv(outpath / 'obs_layer_transmissivities.csv', float_format='%.2f')
mean_t_weighted_heads = np.nansum((heads_2d * Tr_frac), axis=0)
# top layer of open interval
kmin = np.argmax(Tr_frac > 0, axis=0)
# bottom layer of open interval
kmax = Tr_frac.shape[0] - np.argmax(np.flipud(Tr_frac) > 0, axis=0) - 1
# in some cases, the open interval might be mis-matched with the layering
# for example, an open interval might be primarily in layer 4,
# in a location where layer 5 is the only active layer
# this would result in a mean_t_weighted_heads value of 0
# (from the zero transmissivity in that layer)
# fill these instances with the mean of any valid heads at those locations
mean_heads = np.nanmean(heads_2d, axis=0)
misaligned = mean_t_weighted_heads == 0
mean_t_weighted_heads[misaligned] = mean_heads[misaligned]
# verify that there are no nans in the extracted head values (one per obs)
assert not np.any(np.isnan(mean_t_weighted_heads))
# add the simulated heads onto the list for all periods
mean_t_weighted_heads_df = pd.DataFrame({sim_values_column: mean_t_weighted_heads,
'i': i,
'j': j,
'min_layer': kmin,
'max_layer': kmax
},
index=obsnme)
if forecast_sites is not None:
observed_in_period_rs = observed_in_period_rs.reindex(obsnme)
obsprefix = observed_in_period_rs.index.str.split('_', expand=True).levels[0]
observed_in_period_rs['obsprefix'] = obsprefix
observed_in_period_rs['datetime'] = data['datetime'].values[0]
for col in sim_values_column, 'min_layer', 'max_layer', 'i', 'j':
observed_in_period_rs[col] = mean_t_weighted_heads_df[col]
# Alternative option to get head values for specified layers
# (or closest layer if the specified layer doesn't have obs output)
else:
any_simulated_obs = data.obsnme.isin(observed_in_period_rs.obsnme).any()
if not any_simulated_obs:
continue
sim_values = []
for obsnme, layer in zip(observed_in_period_rs.obsnme, observed_in_period_rs.layer):
obsnme_results = data.loc[obsnme]
# if a DataFrame (with simulated values for multiple layers) is returned
if len(obsnme_results.shape) == 2:
layer = obsnme_results.iloc[np.argmin(obsnme_results.layer - layer)]['layer']
sim_value = obsnme_results.iloc[layer][sim_values_column]
# Series (row) in results DataFrame with single simulated value
else:
sim_value = obsnme_results[sim_values_column]
sim_values.append(sim_value)
# add in row, column locations
i, j = get_ij(modelgrid_transform, observed_in_period_rs['x'],
observed_in_period_rs['y'])
observed_in_period_rs['i'] = i
observed_in_period_rs['i'] = j
observed_in_period_rs[sim_values_column] = sim_values
# add stress period and observed values
observed_in_period_rs['per'] = r['per']
observed_in_period_rs[obs_values_column] = observed_in_period_rs[observed_values_obsval_col]
observed_simulated_combined.append(observed_in_period_rs)
# Combined DataFrame of observed heads and simulated equivalents
head_obs = pd.concat(observed_simulated_combined)
# raise an error if there are duplicates- reindexing below will fail if this is the case
if head_obs.index.duplicated().any():
duplicated = head_obs.loc[head_obs.index.duplicated(keep=False)].sort_index()
msg = ('The following observations have duplicate names. There should only be'
'one observation per site, for each time period implied by the '
f'obsnme_date_suffix_format parameter.\n{duplicated}'
)
raise ValueError(msg)
# drop any observations in specified groups
# (e.g. lake stages that should be compared with lake package output)
if drop_groups is not None and 'obgnme' in head_obs.columns:
head_obs = head_obs.loc[~head_obs.obgnme.isin(drop_groups)].copy()
# add standard obsval and obgmne columns
#head_obs['obsval'] = head_obs[obs_values_column]
if 'obgnme' not in head_obs.columns:
head_obs['obgnme'] = variable_name
# fill forecast obs with site info from observed dataframe
if forecast_sites is not None:
for k, v in site_info_dict.items():
head_obs[k] = [v[p] for p in head_obs['obsprefix']]
head_obs['obsnme'] = head_obs.index
else:
# nans are where sites don't have observation values for that period
# or sites that are in other model (inset or parent)
head_obs.dropna(subset=[obs_values_column], axis=0, inplace=True)
# label forecasts in own group
if forecast_sites is not None:
is_forecast = head_obs[obs_values_column].isna()
head_obs.loc[is_forecast, 'obgnme'] += '-forecast'
# cull forecasts to specified date window
# and specific sites (if specified)
keep_forecasts = is_forecast.copy() #np.array([True] * len(head_obs))
if forecast_start_date is not None:
keep_forecasts = (head_obs['datetime'] >= forecast_start_date)
if forecast_end_date is not None:
keep_forecasts &= (head_obs['datetime'] <= forecast_end_date)
#drop = drop & is_forecast
#head_obs = head_obs.loc[~drop].copy()
#is_forecast = head_obs[obs_values_column].isna()
if forecast_sites != 'all':
keep_forecasts &= head_obs['obsprefix'].isin(forecast_sites)
# option to only include forecast obs
# (those without observed equivalents)
if forecasts_only:
keep = keep_forecasts
else:
keep = keep_forecasts | ~is_forecast
# option to only include output from designated forecast sites
if forecast_sites_only:
keep = keep & head_obs['obsprefix'].isin(forecast_sites)
head_obs = head_obs.loc[keep].copy()
# reorder the columns
columns = ['datetime', 'per', 'site_no', 'obsprefix', 'obsnme',
obs_values_column, sim_values_column,
'n', 'uncertainty', 'screen_top', 'screen_botm', 'layer',
'min_layer', 'max_layer', 'i', 'j', 'obgnme']
columns = [c for c in columns if c in head_obs.columns]
head_obs = head_obs[columns].copy()
if 'layer' in columns:
head_obs['layer'] = head_obs['layer'].astype(int)
# fill NaT (not a time) datetimes
fill_nats(head_obs, perioddata)
head_obs.sort_values(by=['obsprefix', 'per'], inplace=True)
if outfile is not None:
head_obs.fillna(-9999).to_csv(outfile, sep=' ', index=False)
print(f'wrote {len(head_obs):,} observations to {outfile}')
# write the instruction file
if write_ins:
write_insfile(head_obs, str(outfile) + '.ins', obsnme_column='obsnme',
simulated_obsval_column=sim_values_column, index=False)
return head_obs