The task in this notebook is: Intersect 50 Million LBSM Coordinates with 5 Million ATKIS-Shapes (Extent: Germany)
This sort of intersection is conceptually simple, but difficult to execute in common GIS software due to the size of the inputs. We can do it in Python, but simple raycasting would take an estimated 3 weeks on a portable computer. The task is therefore mainly one of code optimization and better data handling: the final intersection demonstrated in this notebook, optimized with spatial R-trees and hashsets, finishes in under 1 hour. Along the way, we'll use some interactive visualisations with Geoviews to stream data, as a demonstration of fast previews of really large files.
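As a quick sanity check of these numbers, a back-of-the-envelope calculation using only the figures quoted above:
# rough per-point budget implied by the naive 3-week estimate
naive_runtime_s = 3 * 7 * 24 * 3600   # ~3 weeks, about 1.8 million seconds
n_points = 50_000_000
print(f'{naive_runtime_s / n_points * 1000:.0f} ms per point')  # ~36 ms per naive point-in-polygon test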
First, let's have a look at the data. It is pretty big, so we only stream the first couple of features:
# read the shapefiles
import fiona
import geoviews as gv
import geoviews.feature as gf
from pathlib import Path
from shapely.geometry import shape
import geopandas as gp
from shapely.geometry import Point
import holoviews as hv
from geopandas.tools import sjoin
# enable shapely.speedups, which makes some of the spatial queries run faster
import shapely.speedups
from cartopy import crs as ccrs
hv.notebook_extension('bokeh')
shapely.speedups.enable()
# filepath
atkis_data = Path.cwd() / "01_ATKIS_Separate" / "gruenland.shp"
atkis_shapes = fiona.open(atkis_data)
x = 0
poly_shapes = []
for feature in atkis_shapes:
x += 1
#print(feature)
poly_shapes.append(shape(feature['geometry']))
first_feature = feature
if x > 50:
break
# prepare geoviews layer
gv_shapes = gv.Polygons(
poly_shapes, label=f'ATKIS-Gruen')
hv.Overlay(
gv.tile_sources.EsriImagery * \
gv_shapes.opts(line_color='white', line_width=0.5, fill_alpha=0.5, tools=['hover'])
).opts(
width=800,
height=480
)
With GeoPandas, loading the full shapefile for category Gruenflaeche:
%%time
atkis_gruen = gp.read_file(atkis_data)
atkis_gruen.head()
atkis_gruen.shape
Get preview map
gv_shapes = gv.Polygons(
atkis_gruen.head(50), vdims=['INDIKATOR', 'CellCode50', 'INDIKATOR_', 'Shape_Area']).opts(tools=['hover'])
# add testpoint
testpoint = Point(10.095,47.4)
gv_testpoint = gv.Points([testpoint]).opts(tools=['hover'], size=5, color='red')
hv.Overlay(
gv.tile_sources.EsriImagery * \
gv_shapes.opts(line_color='white', line_width=0.5, fill_alpha=0.5, tools=['hover']) * \
gv_testpoint).opts(
width=800,
height=480
)
atkis_gruen.head(50).total_bounds
Next, the LBSM data. It is also pretty big, so we only stream the first ~10,000 points:
def get_point(line_string):
"""Gets shapely Point from CSV linestring"""
line_arr = line_string.split(',', 4)
if len(line_arr) == 0:
raise ValueError(f"Problem with line: {line_string}")
lng = float(line_arr[3])
lat = float(line_arr[2])
s_point = Point(lng, lat)
return s_point
lbsm_data = Path.cwd() / "02_LBSM_DE" / "2019-02-13_Germany.csv"
lbsn_points = []
x = 0
with open(lbsm_data, "r", encoding="utf-8") as f_handle:
headerline = next(f_handle)
for line in f_handle:
x += 1
s_point = get_point(line)
lbsn_points.append(s_point)
if x > 10000:
break
# make a geoseries from the shapely points
g_points = gp.GeoSeries(lbsn_points)
Preview:
gv_points = gv.Points(
lbsn_points, label=f'LBSM loc')
import holoviews as hv
hv.notebook_extension('bokeh')
hv.Overlay(
gv.tile_sources.EsriImagery * \
gv_points.opts(tools=['hover'], size=16)
).opts(
width=800,
height=480
)
Now we have our two datasets that we want to intersect:
print(atkis_gruen.shape)
print(g_points.shape)
print(len(lbsn_points))
Construct an R-tree index for the geometries of the ATKIS shapes, for faster intersection:
%%time
from rtree import index
# Populate R-tree index with bounds of polygons
idx = index.Index()
for pos, poly in enumerate(atkis_gruen.geometry):
idx.insert(pos, poly.bounds)
We could also build such a spatial index directly in Shapely.
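A minimal sketch of that Shapely-only variant, assuming Shapely 1.x, where STRtree.query() returns the candidate geometries whose bounding boxes intersect the query geometry (Shapely 2.x returns indices instead):
from shapely.strtree import STRtree
# build the tree once over all polygon geometries
str_tree = STRtree(list(atkis_gruen.geometry))
# coarse bounding-box candidates first, then the exact point-in-polygon test
candidates = str_tree.query(testpoint)
matches = [geom for geom in candidates if testpoint.within(geom)]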
# Query testpoint to see which polygon it is in
# using first Rtree index, then Shapely geometry's within
poly_idx = [i for i in idx.intersection((testpoint.coords[0]))
if testpoint.within(atkis_gruen.geometry[i])]
for num, idx_num in enumerate(poly_idx, 1):
testpoly = atkis_gruen.iloc[[idx_num]]
testpoly_l = gv.Polygons(testpoly, vdims=['INDIKATOR', 'CellCode50', 'INDIKATOR_', 'Shape_Area']).opts(tools=['hover'])
hv.Overlay(
gv.tile_sources.EsriImagery * \
gv_shapes.opts(line_color='white', line_width=0.5, fill_alpha=0.5, tools=['hover']) * \
testpoly_l * \
gv_testpoint
).opts(
width=800,
height=480
)
Now we can loop through all points and intersect. Even with the R-tree, this will still take a while, so let's test with a small number first:
len(lbsn_points)
%%time
poly_list = []
point_sel_list = []
for point in lbsn_points:
poly_idx = [i for i in idx.intersection((point.coords[0]))
if point.within(atkis_gruen.geometry[i])]
if len(poly_idx) > 0:
point_sel_list.append(point)
for num, idx_num in enumerate(poly_idx, 1):
#print(atkis_gruen.iloc[[idx_num]])
print(atkis_gruen.iloc[[idx_num]].INDIKATOR_)
#break
poly_list.append(atkis_gruen.geometry[idx_num])
if poly_list:
break
#poly_list.append(atkis_gruen.iloc[[idx_num]].geometry)
#break
gv_points_sel = gv.Points(
point_sel_list, label=f'LBSM loc')
testpolylist_l = gv.Polygons(poly_list)
import holoviews as hv
hv.notebook_extension('bokeh')
hv.Overlay(
gv.tile_sources.EsriImagery * \
testpolylist_l.opts(line_color='white', line_width=0.5, fill_alpha=0.5) * \
gv_points_sel.opts(tools=['hover'], color='red', size=16)
).opts(
width=800,
height=480
)
For intersecting all points, we will add a few further optimizations and outputs:
User origin is available for about 40% of the Flickr users in our dataset. It is publicly provided on Flickr user profiles and was geocoded using the Bing Maps API. We can calculate the distance from the photo location to the user's home location: below 50 km, the user is counted as a "local"; above, the user is likely a "tourist".
We'll first load the geocode data:
%%time
from collections import namedtuple
geocode_data = Path.cwd() / "userorigin" / "u_geocode.csv"
Origin = namedtuple('Origin', 'lat lng')
user_geocode_dict = dict()
with open(geocode_data, "r", encoding="utf-8") as f_handle:
for line in f_handle:
line_arr = line.split(",")
lat = line_arr[0]
lng = line_arr[1]
# strip linebreak:
user_guid = line_arr[3].strip()
user_geocode_dict[user_guid] = Origin(float(lat), float(lng))
print(len(user_geocode_dict))
for key, value in user_geocode_dict.items():
print(f'{key} {value}')
break
Let's have a look at an example. Get the first photo whose creator has a known origin:
%%time
def get_arr(line_string):
"""Gets arrayfrom CSV linestring"""
line_arr = line_string.split(',', 5)
if len(line_arr) == 0:
raise ValueError(f"Problem with line: {line_string}")
return line_arr
lbsn_records = []
#x = 0
y = 0
with open(lbsm_data, "r", encoding="utf-8") as f_handle:
headerline = next(f_handle)
for line in f_handle:
line_arr = get_arr(line)
origin = line_arr[0]
user_guid = line_arr[4]
y += 1
if origin == '2' and user_guid in user_geocode_dict:
photo_sample_line = line
photo_sample_user = user_guid
break
#x += 1
#lbsn_records.append(line)
if y > 1000000:
break
##for record in lbsn_records:
## print(f'{record}\n')
from math import asin, cos, radians, sin, sqrt
def haversine(lon1, lat1, lon2, lat2):
"""
Calculate the great circle distance between two points
on the earth (specified in decimal degrees)
"""
# convert decimal degrees to radians
lon1, lat1, lon2, lat2 = map(
radians, [lon1, lat1, lon2, lat2])
# haversine formula
dlon = lon2 - lon1
dlat = lat2 - lat1
a_value = (sin(dlat/2)**2 + cos(lat1) *
cos(lat2) * sin(dlon/2)**2)
c_value = 2 * asin(sqrt(a_value))
# Radius of earth in kilometers is 6371
km_dist = 6371 * c_value
m_dist = km_dist*1000
return m_dist
photo_location = get_point(photo_sample_line)
user_location = user_geocode_dict.get(photo_sample_user)
distance = haversine(photo_location.x, photo_location.y, user_location.lng, user_location.lat)
print(f'{distance/1000} km')
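For reference, haversine() above implements the standard haversine formula with an Earth radius of R ≈ 6371 km:
$$a = \sin^2\left(\frac{\Delta\varphi}{2}\right) + \cos\varphi_1 \cos\varphi_2 \sin^2\left(\frac{\Delta\lambda}{2}\right), \qquad d = 2R \arcsin\left(\sqrt{a}\right)$$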
import pyproj
def get_circle_path(start, end, sampling=1000):
sx, sy = start
ex, ey = end
g = pyproj.Geod(ellps='WGS84')
(az12, az21, dist) = g.inv(sx, sy, ex, ey)
lonlats = g.npts(sx, sy, ex, ey,
1 + int(dist / sampling))
return lonlats
circle_points = get_circle_path((float(photo_location.x), float(photo_location.y)),(float(user_location.lng), float(user_location.lat)))
user_line = [[float(photo_location.x), float(photo_location.y)],
[float(user_location.lng), float(user_location.lat)]]
#line_path = gv.Path([user_line])
geo_path = gv.Path([circle_points])
hv.Overlay(
gv.tile_sources.EsriImagery * \
gv.Points(
[photo_location], label=f'Photo Location').opts(tools=['hover'], color='red', size=16) * \
gv.Points(
[Point(user_location.lng,user_location.lat)], label=f'User Origin').opts(tools=['hover'], color='blue', size=16) * \
geo_path).opts(
width=800,
height=480
)
Now we can write the distance method used for the local/tourist classification:
def get_dist(photo_location, user_location):
"""Returns haversine distance in km for two points"""
distance = haversine(photo_location.x, photo_location.y, user_location.lng, user_location.lat)
distance_km = distance/1000
return int(distance_km)
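The 50 km threshold described above could then be applied with a small helper like this (a sketch only; the loop further below writes the raw distance to the CSV instead of a label):
def classify_user(distance_km):
    """Classify a user as 'local' (home within 50 km of the photo) or 'tourist'"""
    if distance_km < 50:
        return 'local'
    return 'tourist'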
Additional information should be appended based on the BBSR Gemeindetypen shapefile:
%%time
from rtree import index
bbsr_typen = Path.cwd() / "bbsr_gemeindetypen" / "bbsrgem_projWGS1984.shp"
bbsr_shapes = gp.read_file(bbsr_typen, encoding='utf-8')
# Populate R-tree index with bounds of polygons
idx_bbsr = index.Index()
for pos, poly in enumerate(bbsr_shapes.geometry):
idx_bbsr.insert(pos, poly.bounds)
def get_bbsr_type(loc_point):
poly_idx = [i for i in idx_bbsr.intersection((loc_point.coords[0]))
if loc_point.within(bbsr_shapes.geometry[i])]
if not poly_idx:
with open("debug.log", "a+") as debug_handle:
debug_handle.write(f"No Intersection found! {loc_point}\n")
#raise ValueError(f"No Intersection found! {loc_point}")
return ""
for num, idx_num in enumerate(poly_idx, 1):
# get type from column
# print(bbsr_shapes.iloc[[idx_num]])
# get first value from pandas series, e.g. 'Landgemeinde'
return bbsr_shapes.iloc[[idx_num]].sgtyp_name.values[0]
Test:
# Query testpoint to see which polygon it is in
# using first Rtree index, then Shapely geometry's within
poly_idx = [i for i in idx_bbsr.intersection((testpoint.coords[0]))
if testpoint.within(bbsr_shapes.geometry[i])]
for num, idx_num in enumerate(poly_idx, 1):
testpoly = bbsr_shapes.iloc[[idx_num]]
testpoly_l = gv.Polygons(testpoly, vdims=['sgtyp', 'sgtyp_name', 'sgtypd', 'sgtypd_nam']).opts(tools=['hover'])
hv.Overlay(
gv.tile_sources.EsriImagery * \
testpoly_l.opts(line_color='white', line_width=0.5, fill_alpha=0.5) * \
gv_testpoint.opts(size=10)
).opts(
width=800,
height=480
)
Intersect all LBSM data with ATKIS Shapes and BBSR, append new info, remove lat/lng data, write to CSV.
The following categories exist:
ATKIS Cat | Shape_Count | ATKIS Cat | Shape_Count | ATKIS Cat | Shape_Count | ATKIS Cat | Shape_Count |
---|---|---|---|---|---|---|---|
gruenland | 1517149 | sportfreizeiterholung | 85408 | weinbau | 21373 | heide | 9608 |
ackerland | 895990 | streuobst | 71986 | obstbau | 21256 | sonstsiedlungsfreifl | 9105 |
laubholz | 565031 | parkgruenanlage | 39069 | sonstlandwirts | 20640 | golfplatz | 3055 |
nadelholz | 531657 | friedhof | 34697 | sumpf | 18619 | | |
gehoelz | 485021 | kleingarten | 34426 | wochenendferienhaus | 18356 | | |
mischholz | 459550 | moor | 21789 | gartenland | 13096 | | |
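Each category name corresponds to a separate shapefile in 01_ATKIS_Separate; the selection cell further below is re-run with a different atkis_cat for each one. For convenience, the table entries as a plain Python list (assuming the shapefile names match the table):
# all ATKIS categories from the table above; each is processed by re-running
# the category selection cell and the intersection loop below
atkis_categories = [
    'gruenland', 'ackerland', 'laubholz', 'nadelholz', 'gehoelz', 'mischholz',
    'sportfreizeiterholung', 'streuobst', 'parkgruenanlage', 'friedhof',
    'kleingarten', 'moor', 'weinbau', 'obstbau', 'sonstlandwirts', 'sumpf',
    'wochenendferienhaus', 'gartenland', 'heide', 'sonstsiedlungsfreifl', 'golfplatz']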
Preparation:
%%time
from rtree import index
def get_arr(line_string):
"""Gets arrayfrom CSV linestring"""
line_arr = line_string.split(',', 7)
if len(line_arr) == 0:
raise ValueError(f"Problem with line: {line_string}")
return line_arr
def r_within(original_shape, shape_index, loc_point):
# Query testpoint to see which polygon it is in
# using first Rtree index, then Shapely geometry's within
for i in shape_index.intersection((loc_point.coords[0])):
if loc_point.within(original_shape.geometry[i]):
# if at least one intersection
return True
def perc_itsc(x, x_isc):
if x == 0 or x_isc == 0:
return 0
return int(x_isc/(x/100))
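# cache intersection results per unique coordinate pair: many posts share the exact
# same lat/lng, so each distinct location is tested against the R-tree only once
# (the hashset optimization mentioned in the introduction)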
locations_intersect = set()
locations_nointersect = set()
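# post guids already intersected in a previous run (loaded from all_intersected_guids.csv
# below, if present); such posts are skipped in the main loop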
processed_guids = set()
guids_path = Path.cwd() / "all_intersected_guids.csv"
if guids_path.exists():
with open(guids_path, 'r', encoding='utf8') as read_guids:
for line in read_guids:
processed_guids.add(line.rstrip())
print(f'Loaded {len(processed_guids)} that have already been processed.')
# add some locations that we want to always exclude (e.g. low geoaccuracy)
locations_nointersect.add(f'{10.3822031}::{51.2024651}') # Deutschland
locations_nointersect.add(f'{8.448899}::{49.362361}') # Deutschland / Germany
lbsm_data = Path.cwd() / "02_LBSM_DE" / "2019-02-13_Germany.csv"
Select ATKIS category to process and (re-)create RTree:
%%time
# select cat
atkis_cat = 'gruenland'
output_path = Path.cwd() / "03_Output_LBSM" / f"Germany_LBSM_{atkis_cat}.csv"
atkis_data = Path.cwd() / "01_ATKIS_Separate" / f"{atkis_cat}.shp"
atkis_shapes = gp.read_file(atkis_data, encoding='utf-8')
# Populate R-tree index with bounds of atkis polygons
idx = index.Index()
for pos, poly in enumerate(atkis_shapes.geometry):
idx.insert(pos, poly.bounds)
Loop through all LBSM records: processing 1 million records takes about 37 seconds!
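Extrapolating that rate to the full dataset (a rough estimate, based only on the numbers quoted in this notebook):
# ~37 s per 1 million records, ~50 million records in total
est_runtime_s = 37 * 50
print(f'{est_runtime_s / 60:.0f} min')  # ≈ 31 min, i.e. under the 1 hour stated in the introduction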
Notice that we do not use csv.reader(): given the structure of our data, splitting off only the first few columns that are necessary for filtering improves speed.
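For illustration, the maxsplit argument used by get_arr() keeps everything after the first seven commas joined in the last element:
# str.split() with maxsplit=7: only the first 7 commas are split,
# the remainder of the line stays as one final element
example = "a,b,c,d,e,f,g,h,i,j"
print(example.split(',', 7))  # ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h,i,j']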
%%time
from IPython.display import clear_output
processed_guids.clear()
x = 0
x_pos = 0
with open(lbsm_data, "r", encoding="utf-8") as read_handle, \
open(output_path, 'w', encoding='utf8') as write_handle, \
open(guids_path, 'a+', encoding='utf8') as write_guids:
headerline = next(read_handle)
#print(headerline)
new_headerline = f'origin_id,post_guid,origin_dist,atkis_cat,gemeinde_typ,post_time,{get_arr(headerline)[7]}'
write_handle.write(f'{new_headerline}')
#print(new_headerline)
for line in read_handle:
# update status
msg_text = f'({atkis_cat.upper()}) Processed records: {x} Intersection found: {perc_itsc(x, x_pos)}% ({x_pos})'
if x % 100000 == 0:
clear_output(wait=True)
print(msg_text)
#if x > 1000000:
#break
x += 1
# get first few columns
line_arr = get_arr(line.rstrip())
post_guid = line_arr[1]
# skip already processed guids
if post_guid in processed_guids:
continue
# assign columns
origin = line_arr[0]
user_guid = line_arr[4]
lng = float(line_arr[3])
lat = float(line_arr[2])
loc_id = f'{lat}::{lng}'
# test for intersection
if not loc_id in locations_intersect:
if loc_id in locations_nointersect:
continue
if not r_within(atkis_shapes, idx, Point(lng, lat)):
locations_nointersect.add(loc_id)
continue
locations_intersect.add(loc_id)
# positive intersection, update counters
processed_guids.add(post_guid)
write_guids.write(f'{post_guid}\n')
x_pos += 1
# merge post create & upload time
create_time = line_arr[5]
upload_time = line_arr[6]
post_time = ""
if create_time:
post_time = create_time
elif upload_time:
post_time = upload_time
# get bbsr gemeindetyp
gemeinde_typ = get_bbsr_type(Point(lng, lat))
# get origin / location dist
distance = ""
if origin == '2' and user_guid in user_geocode_dict:
distance = get_dist(Point(lng, lat), user_geocode_dict.get(user_guid))
# write line
write_handle.write(f'{origin},{post_guid},{distance},{atkis_cat},"{gemeinde_typ}",{post_time},{line_arr[7]}\n')
# final status
clear_output(wait=True)
print(msg_text)