import datetime as dt
import numpy as np
import pandas as pd
from mfobs.checks import check_obsnme_suffix
def read_statvar_file(statvar_file):
"""Read a PRMS statvar file into a pandas dataframe.
Parameters
----------
statvar_file : str or pathlike
Path to the PRMS statvar output file.
Returns
-------
df : pandas DataFrame (n days x n sites)
With a datetime index (one row per day),
a 'time' column (the leading timestep counter from the statvar file),
and one column per site.
Site column names are formulated as segment-variable, for example,
*1527-segoutflow* for stream segment outflow at segment 1527
(underscores are removed from variable names because they are
used as delimiters in observation names).
"""
with open(statvar_file) as src:
n_sites = int(next(src).strip())
site_names = []
for i in range(n_sites):
variable, segment = next(src).strip().split()
# remove underscores,
# which are used as delimiter in observation names
variable = variable.replace('_', '')
segment = int(segment)
site_names.append(f"{segment}-{variable}")
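# e.g. a header line 'seg_outflow 1527' becomes the column name '1527-segoutflow'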
parse = lambda x: dt.datetime.strptime(x, '%Y %m %d %H %M %S')
df = pd.read_csv(src, header=None,
sep=r'\s+',
parse_dates={'datetime': list(range(1, 7))}, date_parser=parse,
index_col='datetime')
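# after date parsing, the first remaining column is the statvar timestep counter;
# label it 'time' and name the value columns after the sites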
df.columns = ['time'] + site_names
return df
def get_prms_statvar_obs(perioddata,
statvar_file,
statvar_sitenames=None,
obsnme_date_suffix=True,
obsnme_suffix_format='%Y%m%d',
abs=False):
"""Read raw PRMS statvar observation output from text file table with
times along the row axis and observation sites along the column axis. Reshape
(stack) results to be n times x n sites rows, with a single observation value
in each row.
If there is more than one time in a stress period, retain only
the last time (so that there is one observation per stress period for each site.
Parameters
----------
perioddata : DataFrame
DataFrame with start/end dates for stress periods. Must have columns
'per' (stress period number), 'time' (modflow time, in days),
'start_datetime' (start date for the stress period),
'end_datetime' (end date for the stress period),
and 'steady' (True for steady-state periods, which are skipped).
statvar_file : str or pathlike
Path to PRMS statvar output (text table of n times rows x n sites columns).
statvar_sitenames : dict
Dictionary of the site names (values) associated with each HRU or stream segment
(keys) referenced in the statvar file.
obsnme_date_suffix : bool
If True, assign observations a date-based suffix. Otherwise, assign an
elapsed time-based suffix. In either case, the format of the suffix
is controlled by obsnme_suffix_format.
By default, True.
obsnme_suffix_format : str, optional
Format for the suffix of obsnmes. Observation names are created following the format of
<obsprefix>_<date or elapsed time suffix>. By default, ``'%Y%m%d'``,
which would yield ``'20200101'`` for a Jan 1, 2020 observation
(obsnme_date_suffix=True). If obsnme_date_suffix=False, obsnme_suffix_format
should be a decimal format in the "new-style" string format
(e.g. '{:03d}', which would yield ``'001'`` for an elapsed time of 1 day).
abs : bool, optional
Option to convert simulated values to absolute values.
By default, False.
Returns
-------
results : DataFrame
DataFrame with one observation per row, with the following columns:
=================== =============================================================
datetime            pandas datetimes, based on the stress period start date
site_no             unique identifier for each site
variable            PRMS variable name
obsprefix           prefix of the observation name (<site_no>-<variable>)
obsnme              observation name (<obsprefix>_<date or elapsed time suffix>)
sim_obsval          simulated values
=================== =============================================================
Example observation names:
site1000_20200101, for a Jan 1, 2020 observation at site1000 (obsnme_date_suffix=True)
site1000_00001, for a day 1 observation at site1000 (obsnme_date_suffix=False)
"""
# validation checks
check_obsnme_suffix(obsnme_date_suffix, obsnme_suffix_format,
function_name='get_prms_statvar_obs')
if perioddata.index.name == 'per':
perioddata = perioddata.sort_index()
else:
perioddata = perioddata.sort_values(by='per')
perioddata = perioddata.copy()
perioddata['start_datetime'] = pd.to_datetime(perioddata['start_datetime'])
perioddata['end_datetime'] = pd.to_datetime(perioddata['end_datetime'])
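# derive stress period lengths from the elapsed-time ('time') column if not supplied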
if 'perlen' not in perioddata.columns:
perioddata['perlen'] = perioddata['time'].diff().fillna(0).tolist()
print('reading model output from {}...'.format(statvar_file))
model_output = read_statvar_file(statvar_file)
# convert all observation names to lower case
model_output.columns = model_output.columns.str.lower()
# add stress period information to model output
# update the last time in perioddata to the last statvar time
if perioddata['end_datetime'].iloc[-1] < model_output.index[-1]:
perioddata.loc[perioddata.index[-1], 'end_datetime'] = model_output.index[-1]
model_output['per'] = 0
for i, r in perioddata.iterrows():
# no PRMS output should be applied to steady-state periods
if r['steady']:
continue
model_output.loc[r['start_datetime']:r['end_datetime'], 'per'] = r.per
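# rows that fall outside every transient stress period keep the default per=0 assigned above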
# reshape the model output from (nper rows, nsites columns) to nper x nsites rows
periods = dict(zip(model_output.index, model_output['per']))
times = dict(zip(model_output.index, model_output['time']))
simval_col = 'sim_obsval'
stacked = model_output.drop(['time', 'per'], axis=1).stack(level=0).reset_index()
stacked.columns = ['datetime', 'obsprefix', simval_col]
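# at this point obsprefix holds the raw '<segment>-<variable>' column names;
# it is rebuilt below as '<site name>-<variable>' once site numbers are mapped to names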
if statvar_sitenames is None:
statvar_sitenames = {}
hrus = [int(s.split('-')[0]) for s in stacked['obsprefix']]
sitenames = [statvar_sitenames.get(hru, hru) for hru in hrus]
sitenames = [s.lower() if isinstance(s, str) else s for s in sitenames]
stacked['site_no'] = sitenames
variables = [s.split('-')[1] for s in stacked['obsprefix']]
stacked['variable'] = variables
stacked['obsprefix'] = [f"{sitename}-{variable}".lower()
for sitename, variable in zip(sitenames, variables)]
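# e.g. '1527-segoutflow' if no site name is supplied for segment 1527,
# or 'site1000-segoutflow' if statvar_sitenames maps 1527 to 'site1000'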
stacked['per'] = [periods[ts] for ts in stacked['datetime']]
stacked['time'] = [times[ts] for ts in stacked['datetime']]
# optionally convert simulated values to absolute values
if abs:
stacked[simval_col] = stacked[simval_col].abs()
# assign obsnames using the prefixes (location identifiers) and month
obsnames = []
for prefix, per, timestamp in zip(stacked.obsprefix, stacked.per, stacked.datetime):
if obsnme_date_suffix and not pd.isnull(timestamp):
name = f"{prefix}_{timestamp.strftime(obsnme_suffix_format)}"
elif not obsnme_date_suffix:
suffix = f"{per:{obsnme_suffix_format.strip('{:}')}}"
name = f"{prefix}_{suffix}"
else:
name = prefix
obsnames.append(name)
stacked['obsnme'] = obsnames
if stacked['obsnme'].duplicated().any():
duplicates = stacked.loc[stacked['obsnme'].duplicated(keep=False)].sort_values(by='obsnme')
msg = ("mfobs.prms.get_prms_statvar_obs: "
"Duplicate observation names. If obsnme_date_suffix=True, "
"you may need a more specific obsnme_suffix_format, e.g. '%Y%m%d'.\n"
f"Duplicates:\n{duplicates}"
)
raise ValueError(msg)
stacked.index = stacked['obsnme']
sort_cols = [c for c in ['obsprefix', 'per', 'layer'] if c in stacked.columns]
stacked.sort_values(by=sort_cols, inplace=True)
results = stacked[['datetime', 'site_no', 'variable', 'obsprefix', 'obsnme',
'sim_obsval']].copy()
return results
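# A minimal usage sketch (hypothetical; the statvar file name, dates, and
# site name mapping below are illustrative only and not part of the library):
if __name__ == '__main__':
    perioddata = pd.DataFrame({
        'per': [0, 1, 2],
        'time': [1., 32., 61.],
        'start_datetime': ['2020-01-01', '2020-01-01', '2020-02-01'],
        'end_datetime': ['2020-01-01', '2020-01-31', '2020-02-29'],
        'steady': [True, False, False],
    })
    results = get_prms_statvar_obs(perioddata, 'statvar.dat',
                                   statvar_sitenames={1527: 'site1000'},
                                   obsnme_suffix_format='%Y%m%d')
    print(results.head())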