-
Notifications
You must be signed in to change notification settings - Fork 5
/
delayed_vector.py
267 lines (222 loc) · 10.1 KB
/
delayed_vector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import json
import logging
import os
import tempfile
from datetime import datetime, timedelta
from typing import Iterable, List, Dict
from urllib.parse import urlparse
import fiona
import geopandas as gpd
import pyproj
import requests
from shapely.geometry import shape
from shapely.geometry.base import BaseGeometry
from openeo_driver.errors import OpenEOApiException
from openeo_driver.utils import reproject_bounding_box
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)
class DelayedVector:
    """
    Represents the result of a read_vector process.

    A DelayedVector essentially wraps a reference to a vector file (a path or an
    http(s) URL); it's delayed in that it does not load geometries into memory
    until needed, to avoid MemoryErrors.

    DelayedVector.path contains the path.

    DelayedVector.geometries loads the vector file into memory, so don't do that
    if it contains a lot of geometries (use path instead);
    DelayedVector.bounds should be safe to use.

    A path whose file name ends with ".shp" is read as a shapefile (through fiona);
    anything else is treated as GeoJSON.
    """

    def __init__(self, path: str):
        # TODO: support pathlib too?
        self.path = path
        # Local path of the downloaded .shp file (only set for remote shapefiles).
        self._downloaded_shapefile = None
        # Lazily computed caches for the `crs` and `area` properties.
        self._crs = None
        self._area = None

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.path)

    def __str__(self):
        return self.path

    def __eq__(self, other):
        return isinstance(other, type(self)) and self.path == other.path

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable; equality is decided
        # by `path`, so the hash is derived from `path` as well.
        return hash(self.path)

    def _load_geojson_url(self, url: str) -> dict:
        """
        Download `url` and parse the response body as JSON.

        @param url: URL of the GeoJSON document
        @return: the parsed JSON document
        Raises whatever `raise_for_status()` raises on an HTTP error status, and
        an OpenEOApiException (status 400) when the body is not valid JSON.
        """
        _log.info(f"Loading GeoJSON from {url!r}")
        resp = requests.get(url)
        content_type = resp.headers.get("content-type")
        content_length = resp.headers.get("content-length")
        _log.info(
            f"GeoJSON response: status:{resp.status_code!r}"
            f" content-type:{content_type!r} content-length:{content_length!r}"
        )
        resp.raise_for_status()
        try:
            return resp.json()
        except ValueError as e:
            # ValueError is the common base of json.JSONDecodeError and of the
            # simplejson/requests flavours that `resp.json()` can raise.
            message = f"Failed to parse GeoJSON from URL {url!r} (content-type={content_type!r}, content-length={content_length!r}): {e!r}"
            # TODO: use generic client error? https://github.com/Open-EO/openeo-api/issues/456
            raise OpenEOApiException(status_code=400, message=message) from e

    def _local_shapefile(self):
        """
        Return the local path of the .shp file when this vector is a shapefile
        (downloading it first when `path` is a URL), or None when it is not.
        """
        if self.path.startswith("http"):
            if DelayedVector._is_shapefile(self.path):
                return self._download_shapefile(self.path)
            return None
        # it's a file on disk
        return self.path if self.path.endswith(".shp") else None

    def _load_geojson(self) -> dict:
        """Load the GeoJSON document behind `path` (URL or file on disk)."""
        # FIXME: the parsed document can be cached
        if self.path.startswith("http"):
            return self._load_geojson_url(url=self.path)
        with open(self.path, 'r') as f:
            return json.load(f)

    @property
    def crs(self) -> "pyproj.CRS":
        """CRS of the vector file; determined on first access and cached afterwards."""
        if self._crs is None:
            shp_file = self._local_shapefile()
            if shp_file is not None:
                self._crs = DelayedVector._read_shapefile_crs(shp_file)
            else:  # it's GeoJSON
                self._crs = DelayedVector._read_geojson_crs(self._load_geojson())
        return self._crs

    @property
    def geometries(self) -> Iterable["BaseGeometry"]:
        """
        The geometries of the vector file as shapely geometries.

        The file is (re)loaded on every access. For GeoJSON collections the
        result is a generator, so it can be iterated only once.
        """
        shp_file = self._local_shapefile()
        if shp_file is not None:
            return DelayedVector._read_shapefile_geometries(shp_file)
        # it's GeoJSON
        return DelayedVector._read_geojson_geometries(self._load_geojson())

    @property
    def area(self):
        """
        Sum of the areas of all geometries, computed in an Albers equal-area
        ('aea') projection whose standard parallels are the south and north
        edges of the lat/lon bounding box. Computed once and cached.

        NOTE(review): units are presumably square metres (PROJ default) - confirm.
        """
        if self._area is None:
            df = self.as_geodataframe()
            latlonbounds = reproject_bounding_box(
                dict(zip(["west", "south", "east", "north"], self.bounds)), self.crs, "EPSG:4326"
            )
            equal_area_crs = pyproj.Proj(
                proj='aea',
                lat_1=latlonbounds['south'],
                lat_2=latlonbounds['north'])
            transformed_geometry = df.geometry.to_crs(equal_area_crs.crs)
            self._area = transformed_geometry.area.sum()
        return self._area

    def as_geodataframe(self):
        """
        Loads the vector collection and returns a geopandas GeoDataFrame.

        @return: GeoDataFrame holding all geometries, with `crs` as its CRS
        """
        return gpd.GeoDataFrame(geometry=list(self.geometries), crs=self.crs)

    @property
    def bounds(self) -> (float, float, float, float):
        """
        Bounding box of the vector file; recomputed on every access.

        NOTE(review): zipped with (west, south, east, north) in `area`, so
        presumably in that order - confirm.
        """
        shp_file = self._local_shapefile()
        if shp_file is not None:
            return DelayedVector._read_shapefile_bounds(shp_file)
        # it's GeoJSON
        return DelayedVector._read_geojson_bounds(self._load_geojson())

    @staticmethod
    def from_json_dict(geojson: dict):
        """
        Dump `geojson` to a new temporary file and return a DelayedVector for it.

        The temporary file is created with delete=False, so it is not removed
        automatically.
        """
        with tempfile.NamedTemporaryFile(suffix=".json.tmp", delete=False, mode='w') as temp_file:
            json.dump(geojson, temp_file.file)
            return DelayedVector(temp_file.name)

    @staticmethod
    def _is_shapefile(path: str) -> bool:
        """Whether the file name in `path` (path or URL) ends with ".shp"."""
        return DelayedVector._filename(path).endswith(".shp")

    @staticmethod
    def _filename(path: str) -> str:
        """Last segment of the path component of `path` (query string etc. ignored)."""
        return urlparse(path).path.split("/")[-1]

    @staticmethod
    def _sidecar_url(shp_url: str, extension: str) -> str:
        """
        URL of the sidecar file (e.g. ".shx") that belongs to the shapefile at `shp_url`.

        Only the extension of the URL's path component is swapped; the host, the
        directories and the query string are left alone (a plain
        `str.replace(".shp", ...)` would rewrite every occurrence).
        """
        parsed = urlparse(shp_url)
        stem, _ = os.path.splitext(parsed.path)
        return parsed._replace(path=stem + extension).geturl()

    def _download_shapefile(self, shp_url: str) -> str:
        """
        Download the shapefile at `shp_url` together with its .shx, .dbf and .prj
        sidecar files, and return the local path of the .shp file.

        The result is remembered on this instance. When the download directory
        already exists, nothing is downloaded and its content is reused.
        """
        if self._downloaded_shapefile:
            return self._downloaded_shapefile

        def expiring_download_directory():
            now = datetime.now()
            now_hourly_truncated = now - timedelta(minutes=now.minute, seconds=now.second, microseconds=now.microsecond)
            # NOTE(review): hash() of a str is salted per process unless
            # PYTHONHASHSEED is fixed, so this id only matches within one process.
            hourly_id = hash(shp_url + str(now_hourly_truncated))
            return "/data/projects/OpenEO/download_%s" % hourly_id

        def save_as(src_url: str, dest_path: str):
            # NOTE(review): the HTTP status is not checked, so the body of an
            # error response gets written to disk as well.
            with open(dest_path, 'wb') as f:
                f.write(requests.get(src_url).content)

        download_directory = expiring_download_directory()
        shp_file = download_directory + "/" + DelayedVector._filename(shp_url)

        try:
            os.mkdir(download_directory)
        except FileExistsError:
            # Directory is already there: reuse whatever it contains.
            pass
        else:
            local_stem, _ = os.path.splitext(shp_file)
            save_as(shp_url, shp_file)
            for extension in (".shx", ".dbf", ".prj"):
                save_as(DelayedVector._sidecar_url(shp_url, extension), local_stem + extension)

        self._downloaded_shapefile = shp_file
        return self._downloaded_shapefile

    @staticmethod
    def _read_shapefile_geometries(shp_path: str) -> List["BaseGeometry"]:
        """Read all geometries of the shapefile at `shp_path` into a list."""
        # FIXME: returned as a list for safety but possible to return as an iterable?
        with fiona.open(shp_path) as collection:
            return [shape(record['geometry']) for record in collection]

    @staticmethod
    def _read_shapefile_bounds(shp_path: str) -> (float, float, float, float):
        """Return the bounds that fiona reports for the shapefile at `shp_path`."""
        with fiona.open(shp_path) as collection:
            return collection.bounds

    @staticmethod
    def _read_shapefile_crs(shp_path: str) -> "pyproj.CRS":
        """
        @param shp_path: path of the .shp file
        @return: the `crs` of the fiona collection, as is.
            NOTE(review): previously documented as "a proj4 dict" while the
            annotation says pyproj.CRS - confirm what fiona returns here.
        """
        with fiona.open(shp_path) as collection:
            return collection.crs

    @staticmethod
    def _as_geometry_collection(feature_collection: Dict) -> Dict:
        """
        Wrap the geometries of a FeatureCollection in a GeometryCollection-like dict.

        'geometries' is a generator, so it can be consumed only once.
        """
        # TODO #71 #114 Deprecate/avoid usage of GeometryCollection
        geometries = (feature['geometry'] for feature in feature_collection['features'])
        return {
            'type': 'GeometryCollection',
            'geometries': geometries
        }

    @staticmethod
    def _read_geojson_geometries(geojson: Dict) -> Iterable["BaseGeometry"]:
        """
        Convert a GeoJSON document to shapely geometries: a generator for a
        FeatureCollection or GeometryCollection, a one-element list otherwise.
        """
        if geojson['type'] == 'FeatureCollection':
            geojson = DelayedVector._as_geometry_collection(geojson)

        if geojson['type'] == 'GeometryCollection':
            return (shape(geometry) for geometry in geojson['geometries'])
        return [shape(geojson)]

    @staticmethod
    def _read_geojson_bounds(geojson: Dict) -> (float, float, float, float):
        """Total bounds of all geometries in a GeoJSON document, as a tuple."""
        if geojson['type'] == 'FeatureCollection':
            bounds = gpd.GeoSeries(shape(f["geometry"]) for f in geojson["features"]).total_bounds
        elif geojson['type'] == 'GeometryCollection':
            bounds = gpd.GeoSeries(shape(g) for g in geojson['geometries']).total_bounds
        else:
            geometry = shape(geojson)
            bounds = geometry.bounds
        return tuple(bounds)

    @staticmethod
    def _geojson_crs_name(geojson: Dict):
        """
        Return the value of `crs.properties.name` in a GeoJSON document, or None
        when it is absent. An explicit null for "crs" or "properties" is treated
        like a missing member instead of raising AttributeError.
        """
        crs_member = geojson.get('crs') or {}
        properties = crs_member.get("properties") or {}
        return properties.get("name")

    @staticmethod
    def _read_geojson_crs(geojson: Dict) -> "pyproj.CRS":
        """
        Determine the CRS of a GeoJSON document: its named "crs" member when
        there is one, EPSG:4326 otherwise.
        """
        # so actually geojson has no crs, it's always lat lon, need to check what gdal does...
        crs = DelayedVector._geojson_crs_name(geojson)
        # TODO: what's the deal with this deprecated "init"?
        if crs is None:
            return pyproj.CRS({'init': 'epsg:4326'})
        elif crs.startswith("urn:ogc:"):
            return pyproj.CRS(crs)
        else:
            return pyproj.CRS({'init': crs})