# Source code for mfobs.heads

"""
Functions for processing head observations
"""
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
from mfobs.checks import check_obsnme_suffix
from mfobs.fileio import load_array, write_insfile
from mfobs.modflow import get_mf6_single_variable_obs, get_ij, get_transmissivities
from mfobs.utils import fill_nats, set_period_start_end_dates


def get_head_obs(perioddata, modelgrid_transform, model_output_file,
                 observed_values_file,
                 gwf_obs_input_file,
                 observed_values_metadata_file=None,
                 variable_name='head',
                 observed_values_site_id_col='obsprefix',
                 observed_values_datetime_col='datetime',
                 obsnme_date_suffix=True,
                 obsnme_suffix_format='%Y%m',
                 observed_values_obsval_col='obsval',
                 observed_values_x_col='x',
                 observed_values_y_col='y',
                 observed_values_screen_top_col='screen_top',
                 observed_values_screen_botm_col='screen_botm',
                 observed_values_layer_col=None,
                 observed_values_group_column='obgnme',
                 observed_values_unc_column='uncertainty',
                 aggregrate_observed_values_by='mean',
                 gwf_obs_block=None,
                 drop_groups=None,
                 hk_arrays=None, top_array=None, botm_arrays=None,
                 label_period_as_steady_state=None,
                 steady_state_period_start=None,
                 steady_state_period_end=None,
                 forecast_sites=None,
                 forecast_start_date=None,
                 forecast_end_date=None,
                 forecasts_only=False,
                 forecast_sites_only=False,
                 min_open_interval_frac_in_model=0.5,
                 verbose=True,
                 write_ins=False, outfile=None):
    """Post-processes model output to be read by PEST, and optionally,
    writes a corresponding PEST instruction file. Reads model output
    using :func:`~mfobs.modflow.get_mf6_single_variable_obs`.

    General paradigm is to include all model layers in the MODFLOW input
    for each observation, and then post-process the model results to a single
    value by computing a transmissivity-weighted average.

    Observation names to match observed values to their simulated equivalents
    are constructed in the format of <obsprefix>_<date suffix>, where
    obsprefix is a site identifier taken from the
    ``observed_values_site_id_col`` in ``observed_values_file``. In creating
    observation names for MODFLOW output, the column names in the observation
    CSV output are used for the prefixes. Therefore, the identifiers in
    ``observed_values_site_id_col`` should correspond to observations in the
    MODFLOW observation input.

    The date suffix is formatted using the ``obsnme_date_suffix_format``
    parameter, which is also passed to
    :func:`~mfobs.modflow.get_mf6_single_variable_obs` for assigning
    observation names to the MODFLOW observation output.

    Optionally, a model stress period can be labeled as steady-state
    (``label_period_as_steady_state``), representing average conditions over a
    time period bracked by a ``steady_state_period_start`` and
    ``steady_state_period_end``. In this case, the simulated values for the
    labeled stress period are matched to average values for the steady-state
    time period.

    Parameters
    ----------
    perioddata : str
        Path to csv file with start/end dates for stress periods. Must have columns
        'time' (modflow time, in days), 'start_datetime' (start date for the stress period)
        and 'end_datetime' (end date for the stress period).
    modelgrid_transform : str
        An `affine.Affine <https://github.com/sgillies/affine>`_ object describing
        the orientation of the model grid. Modflow-setup
        :class:`~mfsetup.grid.MFsetupGrid` have this attached via the
        :meth:`~mfsetup.grid.MFsetupGrid.transform` property. Example::

            modelgrid_transform=affine.Affine(1000.0, 0.0, 500955, 0.0, -1000.0, 1205285)

        for a uniform spacing of 1000 and upper left corner of 500955, 1205285
        with a rotation of 45 degrees, counter-clockwise about the upper left corner::

            modelgrid_transform=affine.Affine(1000.0, 0.0, 500955, 0.0, -1000.0, 1205285).rotation(45.)

        An ``affine.Affine`` instance can also be created from a
        `Modflow-setup <https://github.com/aleaf/modflow-setup>`_
        grid JSON file via the :func:`~mfobs.modflow.get_modelgrid_transform` function.
    model_output_file : str
        Modflow-6 head observation CSV output file.
        Read by :func:`~mfobs.modflow.get_mf6_single_variable_obs`.
    observed_values_file : str or DataFrame
        CSV file or DataFrame with observed values.
        Must have the following columns (default names are shown,
        other names can be specified with observed_values_**_col variables below):

        ============= ========================
        site_id       site identifier
        datetime      date/time of observation
        obsval        observed value
        ============= ========================

        can optionally include these columns, or this information can be
        supplied in an observed_values_metadata_file, which will be joined
        on site_id

        ============= ========================
        x             x location
        y             y location
        screen_top    screen top elevation
        screen_botm   screen bottom elevation
        ============= ========================

        If supplied, observation group and uncertainty information will be
        passed through to the output ``base_data`` DataFrame:

        ============= ==================================
        obgnme        observation group
        uncertainty   estimated measurement uncertainty
        ============= ==================================

        Locations and screen tops and bottoms are assumed to be in the same
        CRS and length units as the model.
    observed_values_metadata_file : str, optional
        Site information for the observed values timeseries. Should include
        a `site_id` column that is the same as observed_values_site_id_col,
        and any of the following columns that are not in the
        observed_values_file:

        ============= ========================
        x             x location
        y             y location
        screen_top    screen top elevation
        screen_botm   screen bottom elevation
        ============= ========================

    gwf_obs_input_file : str
        Input file to MODFLOW-6 observation utility (contains layer information).
    variable_name : str, optional
        Column with simulated output will be named "sim_<variable_name",
        by default 'head'
    observed_values_site_id_col : str, optional
        Column name in observed_values_file with site identifiers,
        by default 'obsprefix'
    observed_values_datetime_col : str, optional
        Column name in observed_values_file with observation date/times,
        by default 'datetime'
    obsnme_date_suffix : bool
        If true, give observations a date-based suffix. Otherwise, assign a
        stress period-based suffix. In either case, the format of the suffix
        is controlled by obsnme_suffix_format.
        by default True
    obsnme_suffix_format : str, optional
        Format for suffix of obsnmes. Observation names are created following
        the format of <obsprefix>_<date or stress period suffix>.
        By default, ``'%Y%m'``, which would yield ``'202001'`` for a
        Jan, 2020 observation (obsnme_date_suffix=True). If
        obsnme_date_suffix=False, obsnme_suffix_format should be a decimal
        format in the "new-style" string format (e.g. '{:03d}', which would
        yield ``'001'`` for stress period 1.)
    observed_values_obsval_col : str, optional
        Column name in observed_values_file with observed values,
        by default 'obsval'
    observed_values_x_col : str, optional
        Column name in observed_values_file with x-coordinates,
        by default 'x'
    observed_values_y_col : str, optional
        Column name in observed_values_file with y-coordinates,
        by default 'y'
    observed_values_screen_top_col : str, optional
        Column name in observed_values_file with screen top elevations,
        by default 'screen_top'
    observed_values_screen_botm_col : str, optional
        Column name in observed_values_file with screen bottom elevations,
        by default 'screen_botm'
    observed_values_layer_col : str, optional
        As an alternative to providing screen tops and bottoms, the model
        layer for each observation can be specified directly via a layer
        column of zero-based layer numbers.
        by default None
    observed_values_group_column : str, optional
        Column name in observed_values_file with observation group
        information. Passed through to output ``base_data`` DataFrame,
        otherwise not required.
        by default 'obgnme'
    observed_values_unc_column : str, optional
        Column name in observed_values_file with observation uncertainty
        values. Passed through to output ``base_data`` DataFrame,
        otherwise not required.
        by default 'uncertainty'
    aggregrate_observed_values_by : str
        Method for aggregating observed values to the model stress periods,
        if there are multiple observed values in a stress period.
        Can be any of the method calls on the pandas
        `Resampler <https://pandas.pydata.org/pandas-docs/stable/reference/resampling.html>`_
        object. By default, 'mean'
    gwf_obs_block : None or int
        Argument to read a specific observation block or all blocks from
        GWF observation utility file. Value of None returns observations
        from all blocks. Integer value returns obs from a specifc block,
        in (zero-based) order, from top to bottom. For example, a value of
        0 would return the first obs block, value of 1 would return the
        second obs block, and so on. Modflow-setup writes all of the
        observation input to a single block, but observation input can be
        broken out into multiple blocks (one per file).
        by default, None (All blocks)
    drop_groups : sequence, optional
        Observation groups to exclude from output, by default None
    hk_arrays : list-like of pathlikes or ndarray, optional
        File paths to text arrays with hydraulic conductivity values
        (ordered by model layer). Used in the transmissivity-weighted
        averaging. by default None
    top_array : str, pathlike or ndarray, optional
        File paths to text array with model top elevations. Used in the
        transmissivity-weighted averaging. by default None
    botm_arrays : str, pathlike or ndarray, optional
        File paths to text arrays with model cell bottom elevations.
        (ordered by model layer). Used in the transmissivity-weighted
        averaging. by default None
    label_period_as_steady_state : int, optional
        Zero-based model stress period where observations will be
        assigned the suffix 'ss' instead of a date suffix.
        By default, None, in which case all model output is assigned
        a date suffix based on the start date of the stress period.
        Passed to :func:`~mfobs.modflow.get_mf6_single_variable_obs`.
    steady_state_period_start : str, optional
        Start date for the period representing steady-state conditions.
        Observations between ``steady_state_period_start`` and
        ``steady_state_period_end`` will be averaged to create additional
        observations with the suffix 'ss'. The steady-state averages will be
        matched to model output from the stress period specified by
        ``label_period_as_steady_state``.
        By default None, in which case all observations are used
        (if ``observed_values_datetime_col is None``) or no steady-state
        observations are created (``observed_values_datetime_col`` exists).
    steady_state_period_end : str, optional
        End date for the period representing steady-state conditions.
        By default None, in which case all observations are used
        (if ``observed_values_datetime_col is None``) or no steady-state
        observations are created (``observed_values_datetime_col`` exists).
    forecast_sites : str or sequence, optional
        At these sites, observations will be created for each simulated
        value, regardless is there is an observed equivalent. Can be
        supplied as a sequence of site numbers (`site_id`s) or ``'all'`` to
        include all sites. By default, None (no forecast sites).
    forecast_start_date : str, optional
        Start date for forecast period. When forecast_sites is not
        ``None``, forecast observations will be generated for each
        time between `forecast_start_date` and `forecast_end_date`.
        By default, None (generate forecasts for any time with missing
        values).
    forecast_end_date : str, optional
        End date for forecast period. When forecast_sites is not
        ``None``, forecast observations will be generated for each
        time between `forecast_start_date` and `forecast_end_date`.
        By default, None (generate forecasts for any time with missing
        values).
    forecasts_only : bool, optional
        Use this option to only output forecast observations
        (those without an observed equivalent), subject to the
        parameters of `forecast_sites`, `forecast_start_date`,
        and `forecast_end_date`.
    forecast_sites_only : bool, optional
        Option to only output observations at sites specified
        with `forecast_sites` (has no effect if ``forecast_sites='all'``).
        If ``forecasts_only=False``, the output will include forecast and
        non-forecast observations (for a continuous time-series).
    min_open_interval_frac_in_model : float ranging from 0 to 1, optional
        Option to cull observations with less than
        min_open_interval_frac_in_model fraction of their open interval
        within the model domain. Dropped head observation locations
        will be reported in
        dropped_head_observations_above_or_below_model.csv.
        by default, 0.5 (only retain observations with 50% or more of their
        open interval within the model domain)
    verbose : bool
        Controls the verbosity of screen outputs. If True, warnings will
        be printed when there are no observations within a stress period
        and forecast_sites is None (meaning no observation results will be
        processed for that stress period).
        by default, True
    outfile : str, optional
        CSV file to write output to.
        By default, None (no output written)
    write_ins : bool, optional
        Option to write instruction file, by default False

    Returns
    -------
    base_data : DataFrame
        With the following columns:

        ===================== ====================================================
        datetime              pandas datetimes for the start of each stress period
        per                   model stress period
        obsprefix             observation site identifier
        obsnme                observation name based on format of <obsprefix>_'%Y%m'
        obs_<variable_name>   observed values
        sim_<variable_name>   simulated observation equivalents
        screen_top            screen top elevation
        screen_botm           screen bottom elevation
        ===================== ====================================================

        Example observation names:

        site1000_202001, for a Jan. 2020 observation at site1000
        (obsnme_date_suffix=True)

        site1000_001, for a stress period 1 observation at site1000
        (obsnme_date_suffix=False)

        a steady-state stress period specified with
        label_period_as_steady_state is given the suffix of 'ss'
        e.g. site1000_ss

    Notes
    -----
    All observation names and observation prefixes are converted to lower
    case to avoid potential case issues.

    """
    # validation checks
    check_obsnme_suffix(obsnme_date_suffix, obsnme_suffix_format,
                        function_name='get_head_obs')

    # auxiliary CSVs (dropped sites, transmissivities) are written
    # next to outfile, or to the cwd if no outfile was given
    outpath = Path('.')
    if outfile is not None:
        outpath = Path(outfile).parent

    obs_values_column = 'obsval'  # + variable_name  # output column with observed values
    sim_values_column = 'sim_obsval'  # + variable_name  # output column with simulated equivalents to observed values

    perioddata = perioddata.copy()
    set_period_start_end_dates(perioddata)
    perioddata.index = perioddata.per
    results = get_mf6_single_variable_obs(perioddata, model_output_file=model_output_file,
                                          gwf_obs_input_file=gwf_obs_input_file,
                                          gwf_obs_block=gwf_obs_block
                                          )

    # rename columns to their defaults
    renames = {observed_values_datetime_col: 'datetime',
               observed_values_x_col: 'x',
               observed_values_y_col: 'y',
               observed_values_screen_top_col: 'screen_top',
               observed_values_screen_botm_col: 'screen_botm',
               observed_values_layer_col: 'layer',
               observed_values_group_column: 'obgnme',
               observed_values_unc_column: 'uncertainty'
               }

    if not isinstance(observed_values_file, pd.DataFrame):
        observed = pd.read_csv(observed_values_file,
                               dtype={observed_values_site_id_col: object})
    else:
        observed = observed_values_file
    if len(observed) == 0:
        raise ValueError("No observed values to process!")
    observed.rename(columns=renames, inplace=True)
    observed['obsprefix'] = observed[observed_values_site_id_col].str.lower()

    # read in the observed values metadata
    if observed_values_metadata_file is not None:
        if not isinstance(observed_values_metadata_file, pd.DataFrame):
            metadata = pd.read_csv(observed_values_metadata_file,
                                   dtype={observed_values_site_id_col: object})
        else:
            metadata = observed_values_metadata_file
        metadata.rename(columns=renames, inplace=True)
        metadata['obsprefix'] = metadata[observed_values_site_id_col].str.lower()

        # join the metadata to the observed data
        metadata.index = metadata['obsprefix'].values
        observed.index = observed['obsprefix'].values
        join_cols = [c for c in ['screen_top', 'screen_botm', 'x', 'y', 'layer']
                     if c in metadata.columns]
        observed = observed.join(metadata[join_cols], rsuffix='_m')
        assert not observed[['x', 'y']].isna().any().any()
        # make a dictionary of site metadata for possible use later
        temp = metadata.copy()
        temp.index = temp['obsprefix']
        site_info_dict = temp.to_dict()
    else:
        observed.index = observed['obsprefix'].values
        # make a dictionary of site metadata for possible use later
        temp = observed.copy()
        temp.index = temp['obsprefix']
        site_info_dict = temp.to_dict()
    # delete some columns from the observed values file
    # which result in values assigned by the function
    # later being overwritten with nans
    del_cols = ['datetime', 'per']
    for col in del_cols:
        if col in site_info_dict:
            del site_info_dict[col]
    del temp

    # cast datetimes to pandas datetimes
    # (observed data may not have a datetime col. if model is steady-state)
    if 'datetime' in observed.columns:
        observed['datetime'] = pd.to_datetime(observed['datetime'])
        # not necessarily True
        observed['steady'] = False  # flag for steady-state observations
    elif label_period_as_steady_state is not None:
        observed['datetime'] = pd.to_datetime(
            perioddata['start_datetime'][label_period_as_steady_state])
        observed['steady'] = True
    else:
        if len(perioddata) == 1:
            observed['datetime'] = pd.to_datetime(perioddata['start_datetime'][0])
            observed['steady'] = True
        else:
            raise ValueError("Model has more than one stress period "
                             "but observed data have no 'datetime' column.")

    # drop model results that aren't in the obs information file
    # these are probably observations that aren't in the model time period
    # (and therefore weren't included in the parent model calibration;
    # but modflow-setup would include them in the MODFLOW observation input)
    # also drop sites that are in the obs information file, but not in the model results
    # these include sites outside of the model (i.e. in the inset when looking at the parent)
    # no forecast observations;
    # drop sites that don't have an observed/sim equivalent pair
    if observed_values_metadata_file is not None:
        observed_obsprefixes = set(metadata.obsprefix)
    else:
        observed_obsprefixes = set(observed.obsprefix)
    no_info_sites = set(results.obsprefix).symmetric_difference(observed_obsprefixes)
    if forecast_sites == 'all':
        # forecast observations at all simulated sites
        # (only drop sites that aren't simulated)
        no_info_sites = observed_obsprefixes.difference(results.obsprefix)
    elif forecast_sites is not None:
        # remove selected forecast sites from 'no_info' sites to drop
        forecast_sites = {s.lower() for s in forecast_sites}
        no_info_sites = no_info_sites.difference(forecast_sites)
    # dump these out to a csv
    if len(no_info_sites) > 0:
        print('Dropping {} sites with no information'.format(len(no_info_sites)))
        dropped_obs_outfile = outpath / 'dropped_head_observation_sites.csv'
        results.loc[results.obsprefix.isin(no_info_sites)].to_csv(dropped_obs_outfile,
                                                                  index=False)
        results = results.loc[~results.obsprefix.isin(no_info_sites)].copy()
        observed = observed.loc[~observed.obsprefix.isin(no_info_sites)].copy()

    # get_mf6_single_variable_obs returns values for each layer
    # collapse these into one value for each location, time
    # by taking the transmissivity-weighted average
    if observed_values_layer_col is None:
        if isinstance(hk_arrays[0], str) or isinstance(hk_arrays[0], Path):
            hk = load_array(hk_arrays)
        else:
            hk = hk_arrays
        if isinstance(top_array, str) or isinstance(top_array, Path):
            top = load_array(top_array)
        else:
            top = top_array
        if isinstance(botm_arrays[0], str) or isinstance(botm_arrays[0], Path):
            botm = load_array(botm_arrays)
        else:
            botm = botm_arrays

    # get the x and y location and open interval corresponding to each head observation
    if observed_values_metadata_file is not None:
        x = dict(zip(metadata['obsprefix'], metadata['x']))
        y = dict(zip(metadata['obsprefix'], metadata['y']))
    else:
        x = dict(zip(observed['obsprefix'], observed['x']))
        y = dict(zip(observed['obsprefix'], observed['y']))
    results['x'] = [x[obsprefix] for obsprefix in results.obsprefix]
    results['y'] = [y[obsprefix] for obsprefix in results.obsprefix]
    # get head values based on T-weighted average of open interval
    if observed_values_layer_col is None:
        if 'screen_top' in observed.columns:
            if observed_values_metadata_file is not None:
                screen_top = dict(zip(metadata['obsprefix'], metadata['screen_top']))
            else:
                screen_top = dict(zip(observed['obsprefix'], observed['screen_top']))
            if 'screen_botm' not in observed.columns:
                screen_botm = screen_top
        elif 'screen_botm' not in observed.columns:
            raise ValueError('Observed values need at least a screen top or screen bottom column')
        if 'screen_botm' in observed.columns:
            if observed_values_metadata_file is not None:
                screen_botm = dict(zip(metadata['obsprefix'], metadata['screen_botm']))
            else:
                screen_botm = dict(zip(observed['obsprefix'], observed['screen_botm']))
            if 'screen_top' not in observed.columns:
                screen_top = screen_botm
        elif 'screen_top' not in observed.columns:
            raise ValueError('Observed values need at least a screen top or screen bottom column')
        results['screen_top'] = [screen_top[obsprefix] for obsprefix in results.obsprefix]
        results['screen_botm'] = [screen_botm[obsprefix] for obsprefix in results.obsprefix]

    # for each model stress period, get the simulated values
    # and the observed equivalents
    observed.index = pd.to_datetime(observed.datetime)
    results.index = pd.to_datetime(results.datetime)
    # integer column for stress period- or timestep-based obsnme suffixes
    # timestep-based observations
    if 'timestep' in perioddata.columns:
        perioddata['unique_timestep'] = list(range(len(perioddata)))
        per_column = 'unique_timestep'
        times = dict(zip(perioddata['time'], perioddata['unique_timestep']))
        results[per_column] = [times[t] for t in results['time']]
    # stress period-based observations
    else:
        per_column = 'per'
    observed_simulated_combined = []
    # if no datetime column is supplied with observations,
    # only make steady state obs
    if observed_values_datetime_col is None and \
            label_period_as_steady_state is not None:
        idx = label_period_as_steady_state
        perioddata = perioddata[idx:idx+1]
    for i, r in perioddata.iterrows():

        # get the equivalent observed values
        start, end = str(r['start_datetime']), str(r['end_datetime'])
        # date-based suffix
        if obsnme_date_suffix:
            suffix = pd.Timestamp(end).strftime(obsnme_suffix_format)
        # stress or timestep-based period-based suffix
        else:
            suffix = f"{r[per_column]:{obsnme_suffix_format.strip('{:}')}}"

        # steady-state observations can represent a period
        # other than the "modflow time" in the perioddata table
        if r['per'] == label_period_as_steady_state:
            suffix = 'ss'
            if steady_state_period_start is not None:
                start = str(steady_state_period_start)
            if steady_state_period_end is not None:
                end = str(steady_state_period_end)

        # for now, require results to have period or unique timestep column
        # otherwise, if slicing by datetimes, can run into problem of
        # for example, initial steady-state periods being included
        # with first transient period
        # leading to a duplicate index error when trying to pivot below
        data = results.loc[results[per_column] == r[per_column]].copy()
        if len(data) == 0:
            continue
        # kludge to assign obsnmes to model results
        # until head obs handling gets refactored into obs.get_base_obs
        data['obsnme'] = ['{}_{}'.format(prefix.lower(), suffix)
                          for prefix in data.obsprefix]
        data.index = data['obsnme']

        observed_in_period = observed.sort_index().loc[start:end].reset_index(drop=True)
        # No forecast observations and no observed values in period
        if verbose and (forecast_sites is None) and (len(observed_in_period) == 0):
            warnings.warn(('Stress period {}: No observations between start and '
                           'end dates of {} and {}!'.format(r['per'], start, end)))
            continue
        # If there are forecast sites and observed data in this period
        elif len(observed_in_period) > 0:
            observed_in_period.sort_values(by=['obsprefix', 'datetime'], inplace=True)
            if 'n' not in observed_in_period.columns:
                observed_in_period['n'] = 1
            by_site = observed_in_period.groupby('obsprefix')
            observed_in_period_rs = by_site.first()
            # aggregate all float columns with the requested method
            data_cols = [c for c, dtype in observed_in_period.dtypes.items()
                         if 'float' in dtype.name]
            for c in data_cols:
                observed_in_period_rs[c] = getattr(by_site[c], aggregrate_observed_values_by)()
            observed_in_period_rs['n'] = by_site.n.sum()
            observed_in_period_rs['datetime'] = pd.Timestamp(end)
            if observed_in_period_rs[observed_values_obsval_col].isna().any():
                raise ValueError(f'Nan {observed_values_obsval_col} values in observation data')
            observed_in_period_rs.reset_index(inplace=True)  # put obsprefix back

            missing_cols = set(observed_in_period.columns).difference(observed_in_period_rs.columns)
            for col in missing_cols:
                observed_in_period_rs[col] = by_site[col].first().values
            observed_in_period_rs = observed_in_period_rs[observed_in_period.columns]
            obsnames = ['{}_{}'.format(prefix.lower(), suffix)
                        for prefix in observed_in_period_rs.obsprefix]
            observed_in_period_rs['obsnme'] = obsnames
            observed_in_period_rs.index = observed_in_period_rs['obsnme']
        # Forecast sites, but no observed data
        else:
            observed_in_period_rs = pd.DataFrame(columns=observed.columns)

        # Simulated equivalents
        # Option to get head values based on T-weighted average of open interval
        if observed_values_layer_col is None:
            # get a n layers x n sites array of simulated head observations
            data = data.reset_index(drop=True)
            heads_2d = data.pivot(columns='layer', values='sim_obsval',
                                  index='obsnme').T.values
            obsnme = data.pivot(columns='layer', values='obsnme',
                                index='obsnme').index.tolist()
            # layers containing head values
            # (in case there are any layers not represented in the head data)
            head_layers = data['layer'].unique()
            if heads_2d.shape[0] != botm.shape[0]:
                heads_rs = np.ones((botm.shape[0], heads_2d.shape[1])) * np.nan
                heads_rs[head_layers] = heads_2d
                heads_2d = heads_rs

            # x, y, screen_top and screen_botm have one value for each site
            kwargs = {}
            for arg in 'x', 'y', 'screen_top', 'screen_botm':
                # pivot data to nsites rows x nlay columns
                # positions without data are filled with nans
                pivoted = data.pivot(columns='layer', values=arg, index='obsnme')
                # reduce pivoted data to just one value per site by taking the mean
                # (values should be the same across columns, which represent layers)
                kwargs[arg] = pivoted.mean(axis=1).values

            # get the row, column indices of the observations here
            # so that we can include them in the output
            i, j = get_ij(modelgrid_transform, kwargs.pop('x'), kwargs.pop('y'))
            kwargs['i'] = i
            kwargs['j'] = j

            # cull observations to only those with >min_open_interval_frac_in_model
            # of their open interval within the model
            model_top = top[i, j]
            model_botm = botm[-1, i, j]
            screen_len = kwargs['screen_top'] - kwargs['screen_botm']
            in_model_top = np.min([kwargs['screen_top'], model_top], axis=0)
            in_model_botm = np.max([kwargs['screen_botm'], model_botm], axis=0)
            in_model_len = in_model_top - in_model_botm
            in_model_frac = (in_model_len / screen_len)
            # handle wells with no open interval length
            # (screen top == screen bottom)
            keep = (in_model_frac > min_open_interval_frac_in_model) |\
                (kwargs['screen_top'] < model_top) & (kwargs['screen_botm'] > model_botm)
            # make a table of the wells that are being discarded
            drop_wells = pd.DataFrame({
                'obsnme': np.array(obsnme)[~keep],
                'screen_top': kwargs['screen_top'][~keep],
                'screen_botm': kwargs['screen_botm'][~keep],
                'model_top': model_top[~keep],
                'model_botm': model_botm[~keep],
            })
            drop_wells['reason'] = f"<{min_open_interval_frac_in_model:.0%} of open interval in model"
            drop_wells.to_csv(outpath / 'dropped_head_observations_above_or_below_model.csv',
                              float_format='%.2f')
            obsnme = np.array(obsnme)[keep]
            heads_2d = heads_2d[:, keep].copy()
            for k, v in kwargs.items():
                if isinstance(v, np.ndarray) and len(v) == len(keep):
                    kwargs[k] = v[keep]
            i = i[keep].copy()
            j = j[keep].copy()
            keep_observed_obsnmes = [n for n in obsnme
                                     if n in observed_in_period_rs.index]
            observed_in_period_rs = observed_in_period_rs.loc[keep_observed_obsnmes].copy()

            # get the transmissivity associated with each head obs
            T = get_transmissivities(heads_2d, hk, top, botm,
                                     modelgrid_transform=modelgrid_transform,
                                     **kwargs
                                     )
            # compute transmissivity-weighted average heads
            Tr_frac = T / T.sum(axis=0)
            Tr_frac_df = pd.DataFrame(Tr_frac.transpose())
            Tr_frac_df['obsnme'] = obsnme
            Tr_frac_df.to_csv(outpath / 'obs_layer_transmissivities.csv', float_format='%.2f')
            mean_t_weighted_heads = np.nansum((heads_2d * Tr_frac), axis=0)
            # top layer of open interval
            kmin = np.argmax(Tr_frac > 0, axis=0)
            # bottom layer of open interval
            kmax = Tr_frac.shape[0] - np.argmax(np.flipud(Tr_frac) > 0, axis=0) - 1

            # in some cases, the open interval might be mis-matched with the layering
            # for example, an open interval might be primarily in layer 4,
            # in a location where layer 5 is the only active layer
            # this would result in a mean_t_weighted_heads value of 0
            # (from the zero transmissivity in that layer)
            # fill these instances with the mean of any valid heads at those locations
            mean_heads = np.nanmean(heads_2d, axis=0)
            misaligned = mean_t_weighted_heads == 0
            mean_t_weighted_heads[misaligned] = mean_heads[misaligned]

            # verify that there are no nans in the extracted head values (one per obs)
            assert not np.any(np.isnan(mean_t_weighted_heads))

            # add the simulated heads onto the list for all periods
            mean_t_weighted_heads_df = pd.DataFrame({sim_values_column: mean_t_weighted_heads,
                                                     'i': i, 'j': j,
                                                     'min_layer': kmin,
                                                     'max_layer': kmax
                                                     }, index=obsnme)
            if forecast_sites is not None:
                # include rows for simulated values with no observed equivalent
                observed_in_period_rs = observed_in_period_rs.reindex(obsnme)
                obsprefix = observed_in_period_rs.index.str.split('_', expand=True).levels[0]
                observed_in_period_rs['obsprefix'] = obsprefix
                observed_in_period_rs['datetime'] = data['datetime'].values[0]
            for col in sim_values_column, 'min_layer', 'max_layer', 'i', 'j':
                observed_in_period_rs[col] = mean_t_weighted_heads_df[col]
        # Alternative option to get head values for specified layers
        # (or closest layer if the specified layer doesn't have obs output)
        else:
            any_simulated_obs = data.obsnme.isin(observed_in_period_rs.obsnme).any()
            if not any_simulated_obs:
                continue
            sim_values = []
            for obsnme, layer in zip(observed_in_period_rs.obsnme,
                                     observed_in_period_rs.layer):
                obsnme_results = data.loc[obsnme]
                # if a DataFrame (with simulated values for multiple layers) is returned
                if len(obsnme_results.shape) == 2:
                    # NOTE(review): argmin of the signed difference picks the most
                    # negative offset, not the nearest layer — confirm intent
                    layer = obsnme_results.iloc[np.argmin(obsnme_results.layer - layer)]['layer']
                    sim_value = obsnme_results.iloc[layer][sim_values_column]
                # Series (row) in results DataFrame with single simulated value
                else:
                    sim_value = obsnme_results[sim_values_column]
                sim_values.append(sim_value)
            # add in row, column locations
            i, j = get_ij(modelgrid_transform,
                          observed_in_period_rs['x'],
                          observed_in_period_rs['y'])
            observed_in_period_rs['i'] = i
            # bugfix: the column indices were previously written to the 'i'
            # column a second time, clobbering the row indices and leaving
            # no 'j' column in the output
            observed_in_period_rs['j'] = j
            observed_in_period_rs[sim_values_column] = sim_values

        # add stress period and observed values
        observed_in_period_rs['per'] = r['per']
        observed_in_period_rs[obs_values_column] = observed_in_period_rs[observed_values_obsval_col]
        observed_simulated_combined.append(observed_in_period_rs)

    # Combined DataFrame of observed heads and simulated equivalents
    head_obs = pd.concat(observed_simulated_combined)

    # raise an error if there are duplicates- reindexing below will fail if this is the case
    if head_obs.index.duplicated().any():
        duplicated = head_obs.loc[head_obs.index.duplicated(keep=False)].sort_index()
        msg = ('The following observations have duplicate names. There should only be '
               'one observation per site, for each time period implied by the '
               f'obsnme_date_suffix_format parameter.\n{duplicated}'
               )
        raise ValueError(msg)

    # drop any observations in specified groups
    # (e.g. lake stages that should be compared with lake package output)
    if drop_groups is not None and 'obgnme' in head_obs.columns:
        head_obs = head_obs.loc[~head_obs.obgnme.isin(drop_groups)].copy()

    # add standard obsval and obgmne columns
    if 'obgnme' not in head_obs.columns:
        head_obs['obgnme'] = variable_name

    # fill forecast obs with site info from observed dataframe
    if forecast_sites is not None:
        for k, v in site_info_dict.items():
            head_obs[k] = [v[p] for p in head_obs['obsprefix']]
        head_obs['obsnme'] = head_obs.index
    else:
        # nans are where sites don't have observation values for that period
        # or sites that are in other model (inset or parent)
        head_obs.dropna(subset=[obs_values_column], axis=0, inplace=True)

    # label forecasts in own group
    if forecast_sites is not None:
        is_forecast = head_obs[obs_values_column].isna()
        head_obs.loc[is_forecast, 'obgnme'] += '-forecast'
        # cull forecasts to specified date window
        # and specific sites (if specified)
        keep_forecasts = is_forecast.copy()
        if forecast_start_date is not None:
            keep_forecasts = (head_obs['datetime'] >= forecast_start_date)
        if forecast_end_date is not None:
            keep_forecasts &= (head_obs['datetime'] <= forecast_end_date)
        if forecast_sites != 'all':
            keep_forecasts &= head_obs['obsprefix'].isin(forecast_sites)
        # option to only include forecast obs
        # (those without observed equivalents)
        if forecasts_only:
            keep = keep_forecasts
        else:
            keep = keep_forecasts | ~is_forecast
        # option to only include output from designated forecast sites
        if forecast_sites_only:
            keep = keep & head_obs['obsprefix'].isin(forecast_sites)
        head_obs = head_obs.loc[keep].copy()

    # reorder the columns
    columns = ['datetime', 'per', 'site_no', 'obsprefix', 'obsnme', obs_values_column,
               sim_values_column, 'n', 'uncertainty', 'screen_top', 'screen_botm',
               'layer', 'min_layer', 'max_layer', 'i', 'j', 'obgnme']
    columns = [c for c in columns if c in head_obs.columns]
    head_obs = head_obs[columns].copy()
    if 'layer' in columns:
        head_obs['layer'] = head_obs['layer'].astype(int)

    # fill NaT (not a time) datetimes
    fill_nats(head_obs, perioddata)

    head_obs.sort_values(by=['obsprefix', 'per'], inplace=True)
    if outfile is not None:
        head_obs.fillna(-9999).to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(head_obs):,} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(head_obs, str(outfile) + '.ins',
                          obsnme_column='obsnme',
                          simulated_obsval_column=sim_values_column,
                          index=False)
    return head_obs