Source code for gisutils.shapefile

"""
Functions for working with shapefiles.
"""
from distutils.version import LooseVersion
import warnings
from pathlib import Path
import os
import collections
import shutil
import fiona
from shapely.geometry import shape, mapping
import numpy as np
import pandas as pd
import pyproj
from pyproj.enums import WktVersion
from gisutils.projection import get_authority_crs, project, get_shapefile_crs
from gisutils.utils import is_sequence


[docs]def df2shp(dataframe, shpname, geo_column='geometry', index=False, retain_order=False, prj=None, epsg=None, proj_str=None, crs=None): """Write a DataFrame with a column of shapely geometries to a shapefile. Parameters ---------- dataframe : pandas.DataFrame shpname : str, filepath Output shapefile geo_column : str Name of column in dataframe with feature geometries (default 'geometry') index : bool If True, include the DataFrame index in the written shapefile retain_order : bool Retain column order in dataframe, using an OrderedDict. Shapefile will take about twice as long to write, since OrderedDict output is not supported by the pandas DataFrame object. prj : str Path to ESRI projection file describing the coordinate reference system of the feature geometries in the 'geometry' column. (specify one of prj, epsg, proj_str) epsg : int EPSG code describing the coordinate reference system of the feature geometries in the 'geometry' column. proj_str : str PROJ string describing the coordinate reference system of the feature geometries in the 'geometry' column. crs : obj A Python int, dict, str, or :class:`pyproj.crs.CRS` instance passed to the :meth:`pyproj.crs.CRS.from_user_input` See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input. Can be any of: - PROJ string - Dictionary of PROJ parameters - PROJ keyword arguments for parameters - JSON string with PROJ parameters - CRS WKT string - An authority string [i.e. 'epsg:4326'] - An EPSG integer code [i.e. 4326] - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')] - An object with a `to_wkt` method. - A :class:`pyproj.crs.CRS` class Returns ------- writes a shapefile to shpname """ # first check if output path exists output_folder = os.path.abspath(os.path.split(shpname)[0]) if os.path.split(shpname)[0] != '' and not os.path.isdir(output_folder): raise IOError("Output folder doesn't exist:\n{}".format(output_folder)) # check for empty dataframe if len(dataframe) == 0: raise IndexError("DataFrame is empty!") df = dataframe.copy() # make a copy so the supplied dataframe isn't edited # reassign geometry column if geo_column is special (e.g. something other than "geometry") if geo_column != 'geometry': df['geometry'] = df[geo_column] df.drop(geo_column, axis=1, inplace=True) # assign none for geometry, to write a dbf file from dataframe Type = None if 'geometry' not in df.columns: df['geometry'] = None Type = 'None' mapped = [None] * len(df) # reset the index to integer index to enforce ordering # retain index as attribute field if index=True df.reset_index(inplace=True, drop=not index) # enforce 10 character limit df.columns = rename_fields_to_10_characters(df.columns) properties = shp_properties(df) del properties['geometry'] # set projection (or use a prj file, which must be copied after shp is written) # alternatively, provide a crs in dictionary form as read using fiona # from a shapefile like fiona.open(inshpfile).crs crs_wkt = None if epsg is not None: warnings.warn('gisutils.df2shp: the epsg argument is deprecated; use crs instead', DeprecationWarning) crs = get_authority_crs(int(epsg)) elif proj_str is not None: warnings.warn('gisutils.df2shp: the proj_str argument is deprecated; use crs instead', DeprecationWarning) crs = get_authority_crs(proj_str) if crs is not None: proj_crs = get_authority_crs(crs) # https://pyproj4.github.io/pyproj/stable/crs_compatibility.html#converting-from-pyproj-crs-crs-for-fiona if LooseVersion(fiona.__gdal_version__) < LooseVersion("3.0.0"): crs_wkt = proj_crs.to_wkt(WktVersion.WKT1_GDAL) else: # GDAL 3+ can use WKT2 crs_wkt = proj_crs.to_wkt() crs = None else: pass if Type != 'None': for g in df.geometry: try: Type = g.type except: continue mapped = [mapping(g) for g in df.geometry] schema = {'geometry': Type, 'properties': properties} length = len(df) if not retain_order: props = df.drop('geometry', axis=1).astype(object).to_dict(orient='records') else: props = [collections.OrderedDict(r) for i, r in df.drop('geometry', axis=1).astype(object).iterrows()] print('writing {}...'.format(shpname), end='') #with fiona.collection(shpname, "w", driver="ESRI Shapefile", crs=crs, crs_wkt=crs_wkt, schema=schema) as output: with fiona.open(shpname, "w", driver="ESRI Shapefile", crs=crs, crs_wkt=crs_wkt, schema=schema) as output: for i in range(length): output.write({'properties': props[i], 'geometry': mapped[i]}) if prj is not None: try: print('copying {} --> {}...'.format(prj, "{}.prj".format(shpname[:-4]))) shutil.copyfile(prj, "{}.prj".format(shpname[:-4])) except IOError: print('Warning: could not find specified prj file. shp will not be projected.') print(' Done')
[docs]def rename_fields_to_10_characters(columns, limit=10): fields = list(map(str, columns)) # convert columns to strings in case some are ints newfields = [] for s in (fields): if s[:limit] not in newfields: newfields.append(s[:limit]) else: for i in range(100): if i < 10: if '{}{}'.format(s[:limit-1], str(i)) not in newfields: newfields.append(s[:limit-1] + str(i)) break elif i < 100: if '{}{}'.format(s[:limit-2], str(i)) not in newfields: newfields.append(s[:limit-2] + str(i)) break return newfields
[docs]def shp2df(shplist, index=None, index_dtype=None, clipto=[], filter=None, true_values=None, false_values=None, layer=None, dest_crs=None, skip_empty_geom=True): """Read shapefile/DBF, list of shapefiles/DBFs, or File geodatabase (GDB) into pandas DataFrame. Parameters ---------- shplist : string or list of shapefile/DBF name(s) or FileGDB index : string Column to use as index for dataframe index_dtype : dtype Enforces a datatype for the index column (for example, if the index field is supposed to be integer but pandas reads it as strings, converts to integer) clipto : list limit what is brought in to items in index of clipto (requires index) filter : tuple (xmin, ymin, xmax, ymax) bounding box to filter which records are read from the shapefile. true_values : list same as argument for pandas read_csv false_values : list same as argument for pandas read_csv layer : str Layer name to read (if opening FileGDB) dest_crs : obj A Python int, dict, str, or pyproj.crs.CRS instance passed to the pyproj.crs.from_user_input See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input. Can be any of: - PROJ string - Dictionary of PROJ parameters - PROJ keyword arguments for parameters - JSON string with PROJ parameters - CRS WKT string - An authority string [i.e. 'epsg:4326'] - An EPSG integer code [i.e. 4326] - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')] - An object with a `to_wkt` method. - A :class:`pyproj.crs.CRS` class skip_empty_geom : True/False, default True Drops shapefile entries with null geometries. DBF files (which specify null geometries in their schema) will still be read. Returns ------- df : DataFrame with attribute fields as columns; feature geometries are stored as shapely geometry objects in the 'geometry' column. """ if isinstance(shplist, str) or isinstance(shplist, Path): shplist = [shplist] if not isinstance(true_values, list) and true_values is not None: true_values = [true_values] if not isinstance(false_values, list) and false_values is not None: false_values = [false_values] if len(clipto) > 0 and index: clip = True else: clip = False # destination crs for geometries read from shapefile(s) if dest_crs is not None: dest_crs = get_authority_crs(dest_crs) df = pd.DataFrame() for shp in shplist: print("\nreading {}...".format(shp)) if not os.path.exists(shp): raise IOError("{} doesn't exist".format(shp)) # crs of current shapefile shp_crs = get_shapefile_crs(shp) # set the destination CRS if none was specified # so that heterogenious shapefiles will be output to # the same CRS if dest_crs is None and shp_crs is not None: dest_crs = shp_crs with fiona.open(shp, 'r', layer=layer) as shp_obj: if index is not None: # handle capitolization issues with index field name fields = list(shp_obj.schema['properties'].keys()) index = [f for f in fields if index.lower() == f.lower()][0] attributes = [] # for reading in shapefiles meta = shp_obj.meta if meta['schema']['geometry'] != 'None': if filter is not None: print('filtering on bounding box {}, {}, {}, {}...'.format(*filter)) if clip: # limit what is brought in to items in index of clipto for line in shp_obj.filter(bbox=filter): props = line['properties'] if not props[index] in clipto: continue props['geometry'] = line.get('geometry', None) attributes.append(props) else: for line in shp_obj.filter(bbox=filter): props = line['properties'] props['geometry'] = line.get('geometry', None) attributes.append(props) print('--> building dataframe... (may take a while for large shapefiles)') shp_df = pd.DataFrame(attributes) # reorder fields in the DataFrame to match the input shapefile if len(attributes) > 0: shp_df = shp_df[list(attributes[0].keys())] # handle null geometries if len(shp_df) == 0: print('Empty dataframe! No clip_features were read.') if filter is not None: print('Check filter {} for consistency \ with shapefile coordinate system'.format(filter)) # shp_df will only have a geometry column if it isn't empty else: geoms = shp_df.geometry.tolist() if skip_empty_geom: null_geoms = [i for i, g in enumerate(geoms) if g is None] shp_df.drop(null_geoms, axis=0, inplace=True) shp_df['geometry'] = [shape(g) for g in shp_df.geometry.tolist()] else: shp_df['geometry'] = [shape(g) if g is not None else None for g in geoms] # for reading in DBF files (just like shps, but without geometry) else: if clip: # limit what is brought in to items in index of clipto for line in shp_obj: props = line['properties'] if not props[index] in clipto: continue attributes.append(props) else: for line in shp_obj: attributes.append(line['properties']) print('--> building dataframe... (may take a while for large shapefiles)') shp_df = pd.DataFrame(attributes) # reorder fields in the DataFrame to match the input shapefile if len(attributes) > 0: shp_df = shp_df[list(attributes[0].keys())] if len(shp_df) == 0: continue # set the dataframe index from the index column if index is not None: if index_dtype is not None: shp_df[index] = shp_df[index].astype(index_dtype) shp_df.index = shp_df[index].values # reproject geometries to dest_crs if needed if shp_crs is not None and dest_crs is not None and shp_crs != dest_crs: shp_df['geometry'] = project(shp_df['geometry'], shp_crs, dest_crs) df = df.append(shp_df) # convert any t/f columns to numpy boolean data if true_values is not None or false_values is not None: replace_boolean = {} for t in true_values: replace_boolean[t] = True for f in false_values: replace_boolean[f] = False # only remap columns that have values to be replaced cols = [c for c in df.columns if c != 'geometry'] for c in cols: if len(set(replace_boolean.keys()).intersection(set(df[c]))) > 0: df[c] = df[c].map(replace_boolean) return df
[docs]def shp_properties(df): newdtypes = {'bool': 'str', 'object': 'str', 'datetime64[ns]': 'str'} # fiona/OGR doesn't like numpy ints # shapefile doesn't support 64 bit ints, # but apparently leaving the ints alone is more reliable # than intentionally downcasting them to 32 bit # pandas is smart enough to figure it out on .to_dict()? for c in df.columns: if c != 'geometry': df[c] = df[c].astype(newdtypes.get(df.dtypes[c].name, df.dtypes[c].name)) if 'int' in df.dtypes[c].name: if np.max(np.abs(df[c])) > 2**31 -1: df[c] = df[c].astype(str) # strip dtypes to just 'float', 'int' or 'str' def stripandreplace(s): return ''.join([i for i in s if not i.isdigit()]).replace('object', 'str') dtypes = [stripandreplace(df[c].dtype.name) if c != 'geometry' else df[c].dtype.name for c in df.columns] properties = collections.OrderedDict(list(zip(df.columns, dtypes))) return properties