"""
Functions for working with shapefiles.
"""
from distutils.version import LooseVersion
import warnings
from pathlib import Path
import os
import collections
import shutil
import fiona
from shapely.geometry import shape, mapping
import numpy as np
import pandas as pd
import pyproj
from pyproj.enums import WktVersion
from gisutils.projection import get_authority_crs, project, get_shapefile_crs
from gisutils.utils import is_sequence
[docs]def df2shp(dataframe, shpname, geo_column='geometry', index=False,
retain_order=False,
prj=None, epsg=None, proj_str=None, crs=None):
"""Write a DataFrame with a column of shapely geometries to a shapefile.
Parameters
----------
dataframe : pandas.DataFrame
shpname : str, filepath
Output shapefile
geo_column : str
Name of column in dataframe with feature geometries (default 'geometry')
index : bool
If True, include the DataFrame index in the written shapefile
retain_order : bool
Retain column order in dataframe, using an OrderedDict. Shapefile will
take about twice as long to write, since OrderedDict output is not
supported by the pandas DataFrame object.
prj : str
Path to ESRI projection file describing the coordinate reference system of the feature geometries
in the 'geometry' column. (specify one of prj, epsg, proj_str)
epsg : int
EPSG code describing the coordinate reference system of the feature geometries
in the 'geometry' column.
proj_str : str
PROJ string describing the coordinate reference system of the feature geometries
in the 'geometry' column.
crs : obj
A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
passed to the :meth:`pyproj.crs.CRS.from_user_input`
See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input.
Can be any of:
- PROJ string
- Dictionary of PROJ parameters
- PROJ keyword arguments for parameters
- JSON string with PROJ parameters
- CRS WKT string
- An authority string [i.e. 'epsg:4326']
- An EPSG integer code [i.e. 4326]
- A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
- An object with a `to_wkt` method.
- A :class:`pyproj.crs.CRS` class
Returns
-------
writes a shapefile to shpname
"""
# first check if output path exists
output_folder = os.path.abspath(os.path.split(shpname)[0])
if os.path.split(shpname)[0] != '' and not os.path.isdir(output_folder):
raise IOError("Output folder doesn't exist:\n{}".format(output_folder))
# check for empty dataframe
if len(dataframe) == 0:
raise IndexError("DataFrame is empty!")
df = dataframe.copy() # make a copy so the supplied dataframe isn't edited
# reassign geometry column if geo_column is special (e.g. something other than "geometry")
if geo_column != 'geometry':
df['geometry'] = df[geo_column]
df.drop(geo_column, axis=1, inplace=True)
# assign none for geometry, to write a dbf file from dataframe
Type = None
if 'geometry' not in df.columns:
df['geometry'] = None
Type = 'None'
mapped = [None] * len(df)
# reset the index to integer index to enforce ordering
# retain index as attribute field if index=True
df.reset_index(inplace=True, drop=not index)
# enforce 10 character limit
df.columns = rename_fields_to_10_characters(df.columns)
properties = shp_properties(df)
del properties['geometry']
# set projection (or use a prj file, which must be copied after shp is written)
# alternatively, provide a crs in dictionary form as read using fiona
# from a shapefile like fiona.open(inshpfile).crs
crs_wkt = None
if epsg is not None:
warnings.warn('gisutils.df2shp: the epsg argument is deprecated; use crs instead',
DeprecationWarning)
crs = get_authority_crs(int(epsg))
elif proj_str is not None:
warnings.warn('gisutils.df2shp: the proj_str argument is deprecated; use crs instead',
DeprecationWarning)
crs = get_authority_crs(proj_str)
if crs is not None:
proj_crs = get_authority_crs(crs)
# https://pyproj4.github.io/pyproj/stable/crs_compatibility.html#converting-from-pyproj-crs-crs-for-fiona
if LooseVersion(fiona.__gdal_version__) < LooseVersion("3.0.0"):
crs_wkt = proj_crs.to_wkt(WktVersion.WKT1_GDAL)
else:
# GDAL 3+ can use WKT2
crs_wkt = proj_crs.to_wkt()
crs = None
else:
pass
if Type != 'None':
for g in df.geometry:
try:
Type = g.type
except:
continue
mapped = [mapping(g) for g in df.geometry]
schema = {'geometry': Type, 'properties': properties}
length = len(df)
if not retain_order:
props = df.drop('geometry', axis=1).astype(object).to_dict(orient='records')
else:
props = [collections.OrderedDict(r) for i, r in df.drop('geometry', axis=1).astype(object).iterrows()]
print('writing {}...'.format(shpname), end='')
#with fiona.collection(shpname, "w", driver="ESRI Shapefile", crs=crs, crs_wkt=crs_wkt, schema=schema) as output:
with fiona.open(shpname, "w", driver="ESRI Shapefile", crs=crs, crs_wkt=crs_wkt, schema=schema) as output:
for i in range(length):
output.write({'properties': props[i],
'geometry': mapped[i]})
if prj is not None:
try:
print('copying {} --> {}...'.format(prj, "{}.prj".format(shpname[:-4])))
shutil.copyfile(prj, "{}.prj".format(shpname[:-4]))
except IOError:
print('Warning: could not find specified prj file. shp will not be projected.')
print(' Done')
[docs]def rename_fields_to_10_characters(columns, limit=10):
fields = list(map(str, columns)) # convert columns to strings in case some are ints
newfields = []
for s in (fields):
if s[:limit] not in newfields:
newfields.append(s[:limit])
else:
for i in range(100):
if i < 10:
if '{}{}'.format(s[:limit-1], str(i)) not in newfields:
newfields.append(s[:limit-1] + str(i))
break
elif i < 100:
if '{}{}'.format(s[:limit-2], str(i)) not in newfields:
newfields.append(s[:limit-2] + str(i))
break
return newfields
[docs]def shp2df(shplist, index=None, index_dtype=None, clipto=[], filter=None,
true_values=None, false_values=None, layer=None, dest_crs=None,
skip_empty_geom=True):
"""Read shapefile/DBF, list of shapefiles/DBFs, or File geodatabase (GDB)
into pandas DataFrame.
Parameters
----------
shplist : string or list
of shapefile/DBF name(s) or FileGDB
index : string
Column to use as index for dataframe
index_dtype : dtype
Enforces a datatype for the index column (for example, if the index field is supposed to be integer
but pandas reads it as strings, converts to integer)
clipto : list
limit what is brought in to items in index of clipto (requires index)
filter : tuple (xmin, ymin, xmax, ymax)
bounding box to filter which records are read from the shapefile.
true_values : list
same as argument for pandas read_csv
false_values : list
same as argument for pandas read_csv
layer : str
Layer name to read (if opening FileGDB)
dest_crs : obj
A Python int, dict, str, or pyproj.crs.CRS instance
passed to the pyproj.crs.from_user_input
See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input.
Can be any of:
- PROJ string
- Dictionary of PROJ parameters
- PROJ keyword arguments for parameters
- JSON string with PROJ parameters
- CRS WKT string
- An authority string [i.e. 'epsg:4326']
- An EPSG integer code [i.e. 4326]
- A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
- An object with a `to_wkt` method.
- A :class:`pyproj.crs.CRS` class
skip_empty_geom : True/False, default True
Drops shapefile entries with null geometries.
DBF files (which specify null geometries in their schema) will still be read.
Returns
-------
df : DataFrame
with attribute fields as columns; feature geometries are stored as
shapely geometry objects in the 'geometry' column.
"""
if isinstance(shplist, str) or isinstance(shplist, Path):
shplist = [shplist]
if not isinstance(true_values, list) and true_values is not None:
true_values = [true_values]
if not isinstance(false_values, list) and false_values is not None:
false_values = [false_values]
if len(clipto) > 0 and index:
clip = True
else:
clip = False
# destination crs for geometries read from shapefile(s)
if dest_crs is not None:
dest_crs = get_authority_crs(dest_crs)
df = pd.DataFrame()
for shp in shplist:
print("\nreading {}...".format(shp))
if not os.path.exists(shp):
raise IOError("{} doesn't exist".format(shp))
# crs of current shapefile
shp_crs = get_shapefile_crs(shp)
# set the destination CRS if none was specified
# so that heterogenious shapefiles will be output to
# the same CRS
if dest_crs is None and shp_crs is not None:
dest_crs = shp_crs
with fiona.open(shp, 'r', layer=layer) as shp_obj:
if index is not None:
# handle capitolization issues with index field name
fields = list(shp_obj.schema['properties'].keys())
index = [f for f in fields if index.lower() == f.lower()][0]
attributes = []
# for reading in shapefiles
meta = shp_obj.meta
if meta['schema']['geometry'] != 'None':
if filter is not None:
print('filtering on bounding box {}, {}, {}, {}...'.format(*filter))
if clip: # limit what is brought in to items in index of clipto
for line in shp_obj.filter(bbox=filter):
props = line['properties']
if not props[index] in clipto:
continue
props['geometry'] = line.get('geometry', None)
attributes.append(props)
else:
for line in shp_obj.filter(bbox=filter):
props = line['properties']
props['geometry'] = line.get('geometry', None)
attributes.append(props)
print('--> building dataframe... (may take a while for large shapefiles)')
shp_df = pd.DataFrame(attributes)
# reorder fields in the DataFrame to match the input shapefile
if len(attributes) > 0:
shp_df = shp_df[list(attributes[0].keys())]
# handle null geometries
if len(shp_df) == 0:
print('Empty dataframe! No clip_features were read.')
if filter is not None:
print('Check filter {} for consistency \
with shapefile coordinate system'.format(filter))
# shp_df will only have a geometry column if it isn't empty
else:
geoms = shp_df.geometry.tolist()
if skip_empty_geom:
null_geoms = [i for i, g in enumerate(geoms) if g is None]
shp_df.drop(null_geoms, axis=0, inplace=True)
shp_df['geometry'] = [shape(g) for g in shp_df.geometry.tolist()]
else:
shp_df['geometry'] = [shape(g) if g is not None else None
for g in geoms]
# for reading in DBF files (just like shps, but without geometry)
else:
if clip: # limit what is brought in to items in index of clipto
for line in shp_obj:
props = line['properties']
if not props[index] in clipto:
continue
attributes.append(props)
else:
for line in shp_obj:
attributes.append(line['properties'])
print('--> building dataframe... (may take a while for large shapefiles)')
shp_df = pd.DataFrame(attributes)
# reorder fields in the DataFrame to match the input shapefile
if len(attributes) > 0:
shp_df = shp_df[list(attributes[0].keys())]
if len(shp_df) == 0:
continue
# set the dataframe index from the index column
if index is not None:
if index_dtype is not None:
shp_df[index] = shp_df[index].astype(index_dtype)
shp_df.index = shp_df[index].values
# reproject geometries to dest_crs if needed
if shp_crs is not None and dest_crs is not None and shp_crs != dest_crs:
shp_df['geometry'] = project(shp_df['geometry'], shp_crs, dest_crs)
df = df.append(shp_df)
# convert any t/f columns to numpy boolean data
if true_values is not None or false_values is not None:
replace_boolean = {}
for t in true_values:
replace_boolean[t] = True
for f in false_values:
replace_boolean[f] = False
# only remap columns that have values to be replaced
cols = [c for c in df.columns if c != 'geometry']
for c in cols:
if len(set(replace_boolean.keys()).intersection(set(df[c]))) > 0:
df[c] = df[c].map(replace_boolean)
return df
[docs]def shp_properties(df):
newdtypes = {'bool': 'str',
'object': 'str',
'datetime64[ns]': 'str'}
# fiona/OGR doesn't like numpy ints
# shapefile doesn't support 64 bit ints,
# but apparently leaving the ints alone is more reliable
# than intentionally downcasting them to 32 bit
# pandas is smart enough to figure it out on .to_dict()?
for c in df.columns:
if c != 'geometry':
df[c] = df[c].astype(newdtypes.get(df.dtypes[c].name,
df.dtypes[c].name))
if 'int' in df.dtypes[c].name:
if np.max(np.abs(df[c])) > 2**31 -1:
df[c] = df[c].astype(str)
# strip dtypes to just 'float', 'int' or 'str'
def stripandreplace(s):
return ''.join([i for i in s
if not i.isdigit()]).replace('object', 'str')
dtypes = [stripandreplace(df[c].dtype.name)
if c != 'geometry'
else df[c].dtype.name for c in df.columns]
properties = collections.OrderedDict(list(zip(df.columns, dtypes)))
return properties