Source code for mapgwm.check_input_datasets

"""Functions to test input files against those
   used for module testing by Travis CI.

   The automatic testing of the scripts uses standard
   input files in tests/data/ to make sure routines will
   run after changes are made.  These tests, however, do
   not check if there are changes to input file formats
   that can cause errors in the processing.  Functions in
   this package just test user specified input files against
   those used in the tests directory.  

   Authors should add a method for any datasets
   added to tests/data/
"""
from pathlib import Path
import numpy as np
import pandas as pd
import os
from mapgwm.headobs import preprocess_headobs, get_data


[docs]def get_header_length(sitefile, col0='SITE_BADGE'):
    with open(sitefile) as src:
        for i, line in enumerate(src):
            if '#' not in str(line) and col0 in str(line):
                return i

[docs]def compare_lists(list1, name1, list2, name2):
    """compare two lists and check for different or missing entries
       print out missing or different entries in list 2 compared to list 1

        Parameters
        ----------
        list1: list 
        name1: str - name to be printed in comparison
        list2: second list
        name2: str - name to be printed in comparison

        Returns
        -------
        passed: boolean,  returns True if files match
    """
    no_match = [x for x in list2 if x not in list1]
    missing = [x for x in list1 if x not in list2]
    passed = True
    if len(no_match) > 0:
        for x in no_match:
            print('{0} is in {2} but not in {1}'.format(x, name1, name2))
            passed = False
    elif len(missing) > 0:
        for x in missing:
            print('{0} is missing in {2} compared to {1}'.format(x, name1, name2))
            passed = False
    else:
        print('lists match: {0}, {1}'.format(name1, name2))
    return passed


[docs]def check_headobs_header(new_data_file, new_meta_file, path_to_tests='.'):
    """headobs.py reads in data from Will Asquith as two csv files
       datafile - csv with observed monthly heads
       metadata - csv with site characteristics

       test_headobs points to 
       mapgwm/tests/data/headobs/GW_monthly_stats_test.txt
       mapgwm/tests/data/headobs/GW_monthly_meta_test.txt

       Which have the top 1000 lines of datasets from August 2020.

       Parameters
       ----------
       new_data_file: path to data file to test
       new_meta_file: path to metadata file to test


       Returns
       -------
       (pass1 and pass2):  Boolean, False if files do not match
    """
       
    path = os.path.join(path_to_tests, 'tests', 'data', 'headobs')
    data_file = os.path.join(path, 'GW_monthly_stats_test.txt')
    metadata_file = os.path.join(path, 'GW_monthly_meta_test.txt')

    data_skiprows = get_header_length(data_file)
    testdata = pd.read_csv(data_file, sep='\t', skiprows=data_skiprows)
    data_skiprows = get_header_length(metadata_file)
    testmeta = pd.read_csv(metadata_file, sep='\t', skiprows=data_skiprows)

    data_skiprows = get_header_length(new_data_file)
    newdata = pd.read_csv(new_data_file, sep='\t', skiprows=data_skiprows)
    data_skiprows = get_header_length(new_meta_file)
    newmeta = pd.read_csv(new_meta_file, sep='\t', skiprows=data_skiprows)

    pass1 = compare_lists(testdata.columns.tolist(), 'Test Dataset', 
                  newdata.columns.tolist(), 'New Dataset')

    pass2 = compare_lists(testmeta.columns.tolist(), 'Test Metafile', 
                  newmeta.columns.tolist(), 'New Metafile')
    
    return (pass1 and pass2)


[docs]def check_preprocess_headobs_input(data_file, metadata_file, output_path='.'):
    test_data_path = Path(__file__).parent / 'tests/data'
    geographic_groups = [test_data_path / 'extents/CompositeHydrographArea.shp',
                         test_data_path / 'extents/MAP_generalized_regions.shp'
                         ]
    output_path = Path(output_path)
    outfile = output_path / 'preprocessed_monthly_output.csv'
    data, metadata = get_data(data_file=data_file, metadata_files=metadata_file)
    preproc_data, preproc_metadata = preprocess_headobs(data, metadata,
                                        head_data_columns=['head', 'last_head'],
                                        data_length_units='feet',
                                        active_area=test_data_path / 'extents/ms_delta.shp',
                                        source_crs=4269, dest_crs=5070,
                                        start_date='1998-04-01',
                                        geographic_groups=geographic_groups,
                                        geographic_groups_col='obsgroup',
                                        outfile=outfile)
    assert outfile.exists()
    assert Path(outfile.parent, outfile.stem + '_info.csv').exists()
    assert Path(outfile.parent, outfile.stem + '_info.shp').exists()
    assert np.all(preproc_data.columns ==
                  ['site_no', 'datetime', 'head', 'last_head', 'head_std', 'n', 'obsprefix'])
    assert not any(set(preproc_data.obsprefix).difference(preproc_metadata.obsprefix))
    assert not any({'site_no', 'x', 'y', 'screen_botm', 'screen_top',
                    'category', 'group'}.difference(preproc_metadata.columns))
    assert preproc_metadata['n'].dtype == np.integer
    # unit conversion was applied evenly
    preproc_data['head_diffs'] = np.abs(preproc_data['head'].values - preproc_data.last_head.values)
    preproc_data.sort_values(by='head_diffs', ascending=False, inplace=True)
    print(preproc_data.head(10))
    assert np.allclose(preproc_data['head'].values, preproc_data.last_head.values, atol=22)
    preproc_metadata['head_diffs'] = np.abs(preproc_metadata['head'].values - preproc_metadata.last_head.values)
    preproc_metadata.sort_values(by='head_diffs', ascending=False, inplace=True)
    print(preproc_metadata.head(10))
    assert np.allclose(preproc_metadata['head'].values, preproc_metadata.last_head.values, atol=22)

    # no negative open intervals
    assert not np.any((preproc_metadata.screen_top - preproc_metadata.screen_botm) < 0)


if __name__ == '__main__':
    data_file = os.path.join('tests', 'data', 'headobs', 'GW_monthly_stats_test.txt')
    metadata_file = os.path.join('tests', 'data', 'headobs', 'GW_monthly_meta_test.txt')

    check = check_headobs_header(data_file, metadata_file)
    if check:
        print('PASSED')
    else:
        print('FAILED')