import datetime as dt
import numpy as np
import pandas as pd
from mfobs.checks import check_obsnme_suffix
def read_statvar_file(statvar_file):
"""Read a PRMS statvar file into a pandas dataframe.
Parameters
----------
statvar_file : str or pathlike
Path to the PRMS statvar output file.
Returns
-------
df : pandas DataFrame (n days x n sites)
With a datetime index (one row per day),
a 'time' column (the leading timestep counter from the statvar file),
and one column per site.
Site column names are formulated as segment-variable, for example,
*1527-segoutflow* for stream segment outflow at segment 1527
(underscores are removed from variable names because they are
used as delimiters in observation names).
"""
with open(statvar_file) as src:
n_sites = int(next(src).strip())
site_names = []
for i in range(n_sites):
variable, segment = next(src).strip().split()
# remove underscores,
# which are used as delimiter in observation names
variable = variable.replace('_', '')
segment = int(segment)
site_names.append(f"{segment}-{variable}")
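# e.g. a header line 'seg_outflow 1527' becomes the column name '1527-segoutflow'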
parse = lambda x: dt.datetime.strptime(x, '%Y %m %d %H %M %S')
df = pd.read_csv(src, header=None,
sep=r'\s+',
parse_dates={'datetime': list(range(1, 7))}, date_parser=parse,
index_col='datetime')
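# after date parsing, the first remaining column is the statvar timestep counter;
# label it 'time' and name the value columns after the sites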
df.columns = ['time'] + site_names
return df
def get_prms_statvar_obs(perioddata,
statvar_file,
statvar_sitenames=None,
obsnme_date_suffix=True,
obsnme_suffix_format='%Y%m%d',
abs=False):
"""Read raw PRMS statvar observation output from text file table with
times along the row axis and observation sites along the column axis. Reshape
(stack) results to be n times x n sites rows, with a single observation value
in each row.
If there is more than one time in a stress period, retain only
the last time (so that there is one observation per stress period for each site.
Parameters
----------
perioddata : DataFrame
DataFrame with start/end dates for stress periods. Must have columns
'per' (stress period number), 'time' (modflow time, in days),
'start_datetime' (start date for the stress period),
'end_datetime' (end date for the stress period),
and 'steady' (True for steady-state periods, which are skipped).
statvar_file : str or pathlike
Path to PRMS statvar output (text table of n times rows x n sites columns).
statvar_sitenames : dict
Dictionary of the site names (values) associated with each HRU or stream segment
(keys) referenced in the statvar file.
obsnme_date_suffix : bool
If True, assign observations a date-based suffix. Otherwise, assign an
elapsed time-based suffix. In either case, the format of the suffix
is controlled by obsnme_suffix_format.
By default, True.
obsnme_suffix_format : str, optional
Format for the suffix of obsnmes. Observation names are created following the format of
<obsprefix>_<date or elapsed time suffix>. By default, ``'%Y%m%d'``,
which would yield ``'20200101'`` for a Jan 1, 2020 observation
(obsnme_date_suffix=True). If obsnme_date_suffix=False, obsnme_suffix_format
should be a decimal format in the "new-style" string format
(e.g. '{:03d}', which would yield ``'001'`` for an elapsed time of 1 day).
abs : bool, optional
Option to convert simulated values to absolute values.
By default, False.
Returns
-------
results : DataFrame
DataFrame with one observation per row, with the following columns:
=================== =============================================================
datetime            pandas datetimes, based on the stress period start date
site_no             unique identifier for each site
variable            PRMS variable name
obsprefix           prefix of the observation name (<site_no>-<variable>)
obsnme              observation name (<obsprefix>_<date or elapsed time suffix>)
sim_obsval          simulated values
=================== =============================================================
Example observation names:
site1000_20200101, for a Jan 1, 2020 observation at site1000 (obsnme_date_suffix=True)
site1000_00001, for a day 1 observation at site1000 (obsnme_date_suffix=False)
"""
# validation checks
check_obsnme_suffix(obsnme_date_suffix, obsnme_suffix_format,
function_name='get_prms_statvar_obs')
if perioddata.index.name == 'per':
perioddata = perioddata.sort_index()
else:
perioddata = perioddata.sort_values(by='per')
perioddata = perioddata.copy()
perioddata['start_datetime'] = pd.to_datetime(perioddata['start_datetime'])
perioddata['end_datetime'] = pd.to_datetime(perioddata['end_datetime'])
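# derive stress period lengths from the elapsed-time ('time') column if not supplied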
if 'perlen' not in perioddata.columns:
perioddata['perlen'] = perioddata['time'].diff().fillna(0).tolist()
print('reading model output from {}...'.format(statvar_file))
model_output = read_statvar_file(statvar_file)
# convert all observation names to lower case
model_output.columns = model_output.columns.str.lower()
# add stress period information to model output
# update the last time in perioddata to the last statvar time
if perioddata['end_datetime'].iloc[-1] < model_output.index[-1]:
perioddata.loc[perioddata.index[-1], 'end_datetime'] = model_output.index[-1]
model_output['per'] = 0
for i, r in perioddata.iterrows():
# no PRMS output should be applied to steady-state periods
if r['steady']:
continue
model_output.loc[r['start_datetime']:r['end_datetime'], 'per'] = r.per
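# rows that fall outside every transient stress period keep the default per=0 assigned above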
# reshape the model output from (nper rows, nsites columns) to nper x nsites rows
periods = dict(zip(model_output.index, model_output['per']))
times = dict(zip(model_output.index, model_output['time']))
simval_col = 'sim_obsval'
stacked = model_output.drop(['time', 'per'], axis=1).stack(level=0).reset_index()
stacked.columns = ['datetime', 'obsprefix', simval_col]
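# at this point obsprefix holds the raw '<segment>-<variable>' column names;
# it is rebuilt below as '<site name>-<variable>' once site numbers are mapped to names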
if statvar_sitenames is None:
statvar_sitenames = {}
hrus = [int(s.split('-')[0]) for s in stacked['obsprefix']]
sitenames = [statvar_sitenames.get(hru, hru) for hru in hrus]
sitenames = [s.lower() if isinstance(s, str) else s for s in sitenames]
stacked['site_no'] = sitenames
variables = [s.split('-')[1] for s in stacked['obsprefix']]
stacked['variable'] = variables
stacked['obsprefix'] = [f"{sitename}-{variable}".lower()
for sitename, variable in zip(sitenames, variables)]
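# e.g. '1527-segoutflow' if no site name is supplied for segment 1527,
# or 'site1000-segoutflow' if statvar_sitenames maps 1527 to 'site1000'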
stacked['per'] = [periods[ts] for ts in stacked['datetime']]
stacked['time'] = [times[ts] for ts in stacked['datetime']]
# optionally convert simulated values to absolute values
if abs:
stacked[simval_col] = stacked[simval_col].abs()
# assign obsnames using the prefixes (location identifiers) and month
obsnames = []
for prefix, per, timestamp in zip(stacked.obsprefix, stacked.per, stacked.datetime):
if obsnme_date_suffix and not pd.isnull(timestamp):
name = f"{prefix}_{timestamp.strftime(obsnme_suffix_format)}"
elif not obsnme_date_suffix:
suffix = f"{per:{obsnme_suffix_format.strip('{:}')}}"
name = f"{prefix}_{suffix}"
else:
name = prefix
obsnames.append(name)
stacked['obsnme'] = obsnames
if stacked['obsnme'].duplicated().any():
duplicates = stacked.loc[stacked['obsnme'].duplicated(keep=False)].sort_values(by='obsnme')
msg = ("mfobs.prms.get_prms_statvar_obs: "
"Duplicate observation names. If obsnme_date_suffix=True, "
"you may need a more specific obsnme_suffix_format, e.g. '%Y%m%d'.\n"
f"Duplicates:\n{duplicates}"
)
raise ValueError(msg)
stacked.index = stacked['obsnme']
sort_cols = [c for c in ['obsprefix', 'per', 'layer'] if c in stacked.columns]
stacked.sort_values(by=sort_cols, inplace=True)
results = stacked[['datetime', 'site_no', 'variable', 'obsprefix', 'obsnme',
'sim_obsval']].copy()
return results
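# A minimal usage sketch (hypothetical; the statvar file name, dates, and
# site name mapping below are illustrative only and not part of the library):
if __name__ == '__main__':
    perioddata = pd.DataFrame({
        'per': [0, 1, 2],
        'time': [1., 32., 61.],
        'start_datetime': ['2020-01-01', '2020-01-01', '2020-02-01'],
        'end_datetime': ['2020-01-01', '2020-01-31', '2020-02-29'],
        'steady': [True, False, False],
    })
    results = get_prms_statvar_obs(perioddata, 'statvar.dat',
                                   statvar_sitenames={1527: 'site1000'},
                                   obsnme_suffix_format='%Y%m%d')
    print(results.head())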