Intersect ATKIS-DE Gruen with LBSM data for Germany

The task in this notebook is: Intersect 50 Million LBSM Coordinates with 5 Million ATKIS-Shapes (Extent: Germany)

This sort of intersection is conceptually simple but difficult to execute in common GIS software due to the size of the inputs. We can do it in Python, but naive ray casting would take an estimated 3 weeks on a laptop. The task is therefore mainly one of code optimization and better data handling. The final intersection demonstrated in this notebook, optimized using a spatial R-tree and hash sets, finishes in under 1 hour. Along the way, we'll use some interactive visualizations with Geoviews to stream data, as a demonstration of fast previews of really large files.
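The core trick behind the R-tree speedup can be illustrated without any libraries: a cheap bounding-box check rejects most candidate polygons before the expensive exact point-in-polygon test runs. This is a simplified, self-contained sketch (not the notebook's actual code, which uses the `rtree` package):

```python
# Simplified illustration of coarse bbox pre-filtering before the exact
# point-in-polygon test (the principle an R-tree index generalizes).

def in_bbox(point, bbox):
    """Cheap test: is (x, y) inside (minx, miny, maxx, maxy)?"""
    x, y = point
    minx, miny, maxx, maxy = bbox
    return minx <= x <= maxx and miny <= y <= maxy

def in_polygon(point, polygon):
    """Exact test via ray casting (even-odd rule)."""
    x, y = point
    inside = False
    n = len(polygon)
    for i in range(n):
        x1, y1 = polygon[i]
        x2, y2 = polygon[(i + 1) % n]
        # edge crosses the horizontal ray through the point?
        if (y1 > y) != (y2 > y):
            if x < (x2 - x1) * (y - y1) / (y2 - y1) + x1:
                inside = not inside
    return inside

# two unit squares: one near the origin, one far away
polygons = [
    [(0, 0), (1, 0), (1, 1), (0, 1)],
    [(10, 10), (11, 10), (11, 11), (10, 11)],
]
bboxes = [(0, 0, 1, 1), (10, 10, 11, 11)]

point = (0.5, 0.5)
hits = [i for i, bbox in enumerate(bboxes)
        if in_bbox(point, bbox) and in_polygon(point, polygons[i])]
print(hits)  # [0]
```

An R-tree does the same bbox rejection, but hierarchically, so the query cost grows logarithmically rather than linearly with the number of polygons.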

1a. Load ATKIS Data

First, let's have a look at the data. It is pretty big, so we only stream the first couple of features:

In [4]:
# read the shapefiles 
import fiona
import geoviews as gv
import geoviews.feature as gf
from pathlib import Path
from shapely.geometry import shape
import geopandas as gp
from shapely.geometry import Point
import holoviews as hv
from geopandas.tools import sjoin
# enable shapely.speedups, which makes some spatial queries run faster
import shapely.speedups
from cartopy import crs as ccrs
hv.notebook_extension('bokeh')
shapely.speedups.enable()

# filepath
atkis_data = Path.cwd() / "01_ATKIS_Separate" / "gruenland.shp"
atkis_shapes = fiona.open(atkis_data)

x = 0
poly_shapes = []
for feature in atkis_shapes:
    x += 1
    #print(feature)
    poly_shapes.append(shape(feature['geometry']))
    first_feature = feature
    if x > 50:
        break
# prepare geoviews layer
gv_shapes = gv.Polygons(
    poly_shapes, label=f'ATKIS-Gruen')
first_feature
In [5]:
hv.Overlay(
    gv.tile_sources.EsriImagery * \
    gv_shapes.opts(line_color='white', line_width=0.5, fill_alpha=0.5, tools=['hover'])
    ).opts(
        width=800,
        height=480
    )
Out[5]:

atkis.png

Load full ATKIS data per category

With Geopandas, loading the full shapefile for category Gruenflaeche:

In [3]:
%%time
atkis_gruen = gp.read_file(atkis_data)
Wall time: 2min 2s
In [4]:
atkis_gruen.head()
atkis_gruen.shape
Out[4]:
(1517149, 14)

Get preview map

In [5]:
gv_shapes = gv.Polygons(
    atkis_gruen.head(50), vdims=['INDIKATOR', 'CellCode50', 'INDIKATOR_', 'Shape_Area']).opts(tools=['hover'])
In [12]:
# add testpoint
testpoint = Point(10.095,47.4)
gv_testpoint = gv.Points([testpoint]).opts(tools=['hover'], size=5, color='red')
In [7]:
hv.Overlay(
    gv.tile_sources.EsriImagery * \
    gv_shapes.opts(line_color='white', line_width=0.5, fill_alpha=0.5, tools=['hover']) * \
    gv_testpoint).opts(
        width=800,
        height=480
    )
Out[7]:
In [8]:
atkis_gruen.head(50).total_bounds
Out[8]:
array([10.08314501, 47.39306148, 10.1382516 , 47.40404001])

1b. Load LBSM Data

First, let's have a look at the data. It is pretty big, so we only stream the first couple of features:

In [9]:
def get_point(line_string):
    """Gets shapely Point from CSV linestring"""
    line_arr = line_string.split(',', 4)
    if len(line_arr) == 0:
        raise ValueError(f"Problem with line: {line_string}")
    lng = float(line_arr[3])
    lat = float(line_arr[2])
    s_point = Point(lng, lat)
    return s_point

lbsm_data = Path.cwd() / "02_LBSM_DE" / "2019-02-13_Germany.csv"
In [10]:
lbsn_points = []
x = 0
with open(lbsm_data, "r", encoding="utf-8") as f_handle:
    headerline = next(f_handle)
    for line in f_handle:
        x += 1
        s_point = get_point(line)
        lbsn_points.append(s_point)
        if x > 10000:
            break
            
# make a geoseries from shapely points
g_points = gp.GeoSeries(lbsn_points)

Preview:

In [11]:
gv_points = gv.Points(
    lbsn_points, label=f'LBSM loc')

import holoviews as hv
hv.notebook_extension('bokeh')
hv.Overlay(
    gv.tile_sources.EsriImagery * \
    gv_points.opts(tools=['hover'], size=16)
    ).opts(
        width=800,
        height=480
    )
Out[11]:

2a. Test Intersect

Now we have our two datasets that we want to intersect:

In [19]:
print(atkis_gruen.shape)
print(g_points.shape)
print(len(lbsn_points))
(1517149, 14)
(10001,)
10001

Construct an R-tree index from the bounds of the ATKIS shapes for faster intersection

In [12]:
%%time
from rtree import index

# Populate R-tree index with bounds of polygons
idx = index.Index()
for pos, poly in enumerate(atkis_gruen.geometry):
    idx.insert(pos, poly.bounds)
Wall time: 4min 1s

We can also do this with shapely's STRtree:

from shapely.strtree import STRtree
polys = atkis_gruen.geometry
s = STRtree(polys)
query_geom = testpoint
result = s.query(query_geom)
polys[0] in result
In [21]:
# Query testpoint to see which polygon it is in
# using first Rtree index, then Shapely geometry's within
poly_idx = [i for i in idx.intersection((testpoint.coords[0]))
        if testpoint.within(atkis_gruen.geometry[i])]
for num, idx_num in enumerate(poly_idx, 1):
    testpoly = atkis_gruen.iloc[[idx_num]]
In [22]:
testpoly_l = gv.Polygons(testpoly, vdims=['INDIKATOR', 'CellCode50', 'INDIKATOR_', 'Shape_Area']).opts(tools=['hover'])
In [23]:
hv.Overlay(
    gv.tile_sources.EsriImagery * \
    gv_shapes.opts(line_color='white', line_width=0.5, fill_alpha=0.5, tools=['hover']) * \
    testpoly_l * \
    gv_testpoint
    
    ).opts(
        width=800,
        height=480
    )
Out[23]:

testpoint_query.png

2b. Actual Data Intersect

Now we can loop through all points and intersect. Despite the R-tree, this will still take a while. Let's test with a small number first:

In [25]:
len(lbsn_points)
Out[25]:
10001
In [59]:
%%time
poly_list = []
point_sel_list = []
for point in lbsn_points:
    poly_idx = [i for i in idx.intersection((point.coords[0]))
            if point.within(atkis_gruen.geometry[i])]
    if len(poly_idx) > 0:
        point_sel_list.append(point)
    for num, idx_num in enumerate(poly_idx, 1):
        #print(atkis_gruen.iloc[[idx_num]])
        print(atkis_gruen.iloc[[idx_num]].INDIKATOR_)
        #break
        poly_list.append(atkis_gruen.geometry[idx_num])
    if poly_list:
        break
        #poly_list.append(atkis_gruen.iloc[[idx_num]].geometry)
    #break
794985    Gruenland
Name: INDIKATOR_, dtype: object
Wall time: 13 ms
In [31]:
gv_points_sel = gv.Points(
    point_sel_list, label=f'LBSM loc')
In [32]:
testpolylist_l = gv.Polygons(poly_list)
In [33]:
import holoviews as hv
hv.notebook_extension('bokeh')
hv.Overlay(
    gv.tile_sources.EsriImagery * \
    testpolylist_l.opts(line_color='white', line_width=0.5, fill_alpha=0.5) * \
    gv_points_sel.opts(tools=['hover'], color='red', size=16)
    ).opts(
        width=800,
        height=480
    )
Out[33]:

ints_test.png

3. Intersect Full Data

For intersecting all points, we will integrate a few other optimizations and outputs:

  • store a hash of distinct lat/lng pairs that have already been intersected
  • also intersect with the BBSR Gemeindetypen shape and append the info to the output
  • remove lat/lng from the original CSV lines, write data to a new CSV per Grün type
  • add locals/tourists information from user data via a distance query
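The first optimization above can be sketched in isolation: since many posts share identical coordinates, caching each distinct lat/lng pair in two hash sets means the expensive polygon test runs only once per location. In this sketch, `expensive_intersect` is a hypothetical stand-in for the real R-tree query:

```python
# Sketch of the distinct-coordinate cache: the expensive polygon test
# runs at most once per unique lat/lng pair.
calls = 0

def expensive_intersect(lat, lng):
    """Placeholder for the real R-tree + within() query."""
    global calls
    calls += 1
    return lat > 0  # dummy rule, for illustration only

locations_intersect = set()
locations_nointersect = set()

def cached_intersect(lat, lng):
    loc_id = f'{lat}::{lng}'
    if loc_id in locations_intersect:
        return True
    if loc_id in locations_nointersect:
        return False
    result = expensive_intersect(lat, lng)
    (locations_intersect if result else locations_nointersect).add(loc_id)
    return result

points = [(51.05, 13.74), (51.05, 13.74), (-1.0, 2.0), (51.05, 13.74)]
results = [cached_intersect(lat, lng) for lat, lng in points]
print(results, calls)  # [True, True, False, True] 2
```

With 50 million points but far fewer distinct coordinates, this cache alone removes most of the polygon queries.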

3a. Get User Origin

User origin is available for about 40% of the Flickr users in our dataset. It is publicly provided on Flickr user profiles and was geocoded using the Bing Maps API. We can calculate the distance from the photo location to the user's home location: below 50 km, the user is counted as a "local"; above, they are likely a "tourist".

We'll first load the geocode data:

In [22]:
%%time
from collections import namedtuple
geocode_data = Path.cwd() / "userorigin" / "u_geocode.csv"
Origin = namedtuple('Origin', 'lat lng')
user_geocode_dict = dict()
with open(geocode_data, "r", encoding="utf-8") as f_handle:
    for line in f_handle:
        line_arr = line.split(",")
        lat = line_arr[0]
        lng = line_arr[1]
        # strip linebreak:
        user_guid = line_arr[3].strip()
        user_geocode_dict[user_guid] = Origin(float(lat), float(lng))
print(len(user_geocode_dict))
434941
Wall time: 1.18 s
In [47]:
for key, value in user_geocode_dict.items():
    print(f'{key} {value}')
    break
2a247bc790f446dfb1abc643352c07af
 Origin(lat='42.3915416', lng='-82.2168868')

Sample User Origin

Let's have a look at an example. Get the first photo whose creator has a known origin:

In [14]:
%%time
def get_arr(line_string):
    """Gets array from CSV line string"""
    line_arr = line_string.split(',', 5)
    if len(line_arr) == 0:
        raise ValueError(f"Problem with line: {line_string}")
    return line_arr

lbsn_records = []
#x = 0
y = 0
with open(lbsm_data, "r", encoding="utf-8") as f_handle:
    headerline = next(f_handle)
    for line in f_handle:
        line_arr = get_arr(line)
        origin = line_arr[0]
        user_guid = line_arr[4]
        y += 1
        if origin == '2' and user_guid in user_geocode_dict:
            photo_sample_line = line
            photo_sample_user = user_guid
            break
            #x += 1
            #lbsn_records.append(line)
        if y > 1000000:
            break


##for record in lbsn_records:
##    print(f'{record}\n')
Wall time: 15.6 ms
In [26]:
from math import asin, cos, radians, sin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(
        radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a_value = (sin(dlat/2)**2 + cos(lat1) *
               cos(lat2) * sin(dlon/2)**2)
    c_value = 2 * asin(sqrt(a_value))
    # Radius of earth in kilometers is 6371
    km_dist = 6371 * c_value
    m_dist = km_dist*1000
    return m_dist
In [18]:
photo_location = get_point(photo_sample_line)
user_location = user_geocode_dict.get(photo_sample_user)
distance = haversine(photo_location.x, photo_location.y, user_location.lng, user_location.lat)
print(f'{distance/1000} km')
6538.467733033563 km
In [20]:
import pyproj
def get_circle_path(start, end, sampling=1000):
    sx, sy = start
    ex, ey = end
    g = pyproj.Geod(ellps='WGS84')
    (az12, az21, dist) = g.inv(sx, sy, ex, ey)
    lonlats = g.npts(sx, sy, ex, ey,
                     1 + int(dist / sampling))
    return lonlats

circle_points = get_circle_path((float(photo_location.x), float(photo_location.y)),(float(user_location.lng), float(user_location.lat)))
user_line = [[float(photo_location.x), float(photo_location.y)],
            [float(user_location.lng), float(user_location.lat)]]
#line_path = gv.Path([user_line])
geo_path = gv.Path([circle_points])
In [21]:
hv.Overlay(
    gv.tile_sources.EsriImagery * \
    gv.Points(
        [photo_location], label=f'Photo Location').opts(tools=['hover'], color='red', size=16) * \
    gv.Points(
        [Point(user_location.lng,user_location.lat)], label=f'User Origin').opts(tools=['hover'], color='blue', size=16) * \
    geo_path).opts(
        width=800,
        height=480
    )
Out[21]:

user_orig.png

Now we can write the classify method:

In [24]:
def get_dist(photo_location, user_location):
    """Returns haversine distance in km for two points"""
    distance = haversine(photo_location.x, photo_location.y, user_location.lng, user_location.lat)
    distance_km = distance/1000
    return int(distance_km)
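Based on the 50 km threshold described above, the actual classification could be a small helper on top of the distance (a sketch; `is_local` is a hypothetical name, not part of the notebook's code):

```python
from math import asin, cos, radians, sin, sqrt

def haversine_km(lon1, lat1, lon2, lat2):
    """Great-circle distance between two points in kilometers."""
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    return 6371 * 2 * asin(sqrt(a))

def is_local(photo_lonlat, origin_lonlat, threshold_km=50):
    """Classify as 'local' if the photo is within 50 km of the user's home."""
    dist = haversine_km(*photo_lonlat, *origin_lonlat)
    return dist < threshold_km

# Dresden photo with a nearby origin vs. a Munich origin
print(is_local((13.74, 51.05), (13.73, 51.04)))  # True (local)
print(is_local((13.74, 51.05), (11.58, 48.14)))  # False (tourist)
```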

3b. Get BBSR Gemeindetypen

Additional info should be appended based on the BBSR Gemeindetypen shape.

In [47]:
%%time
from rtree import index

bbsr_typen = Path.cwd() / "bbsr_gemeindetypen" / "bbsrgem_projWGS1984.shp"
bbsr_shapes = gp.read_file(bbsr_typen, encoding='utf-8')

# Populate R-tree index with bounds of polygons
idx_bbsr = index.Index()
for pos, poly in enumerate(bbsr_shapes.geometry):
    idx_bbsr.insert(pos, poly.bounds)
    
def get_bbsr_type(loc_point):
    poly_idx = [i for i in idx_bbsr.intersection((loc_point.coords[0]))
            if loc_point.within(bbsr_shapes.geometry[i])]
    if not poly_idx:
        with open("debug.log", "a+") as debug_handle:
            debug_handle.write(f"No Intersection found! {loc_point}\n")
        #raise ValueError(f"No Intersection found! {loc_point}")
        return ""
    for num, idx_num in enumerate(poly_idx, 1):
        # get type from column
        # print(bbsr_shapes.iloc[[idx_num]])
        # get first value from pandas series, e.g. 'Landgemeinde'
        return bbsr_shapes.iloc[[idx_num]].sgtyp_name.values[0]
Wall time: 5.42 s

Test:

In [28]:
# Query testpoint to see which polygon it is in
# using first Rtree index, then Shapely geometry's within
poly_idx = [i for i in idx_bbsr.intersection((testpoint.coords[0]))
            if testpoint.within(bbsr_shapes.geometry[i])]
for num, idx_num in enumerate(poly_idx, 1):
    testpoly = bbsr_shapes.iloc[[idx_num]]
In [50]:
testpoly_l = gv.Polygons(testpoly, vdims=['sgtyp', 'sgtyp_name', 'sgtypd', 'sgtypd_nam']).opts(tools=['hover'])
In [55]:
hv.Overlay(
    gv.tile_sources.EsriImagery * \
    testpoly_l.opts(line_color='white', line_width=0.5, fill_alpha=0.5) * \
    gv_testpoint.opts(size=10)
    ).opts(
        width=800,
        height=480
    )
Out[55]:

gemeindetyp.png

3c. Intersect & Write to CSV

Intersect all LBSM data with ATKIS Shapes and BBSR, append new info, remove lat/lng data, write to CSV.

The following categories exist:

ATKIS Cat    Shape_Count    ATKIS Cat                Shape_Count    ATKIS Cat              Shape_Count    ATKIS Cat               Shape_Count
gruenland        1517149    sportfreizeiterholung          85408    weinbau                      21373    heide                          9608
ackerland         895990    streuobst                      71986    obstbau                      21256    sonstsiedlungsfreifl           9105
laubholz          565031    parkgruenanlage                39069    sonstlandwirts               20640    golfplatz                      3055
nadelholz         531657    friedhof                       34697    sumpf                        18619
gehoelz           485021    kleingarten                    34426    wochenendferienhaus          18356
mischholz         459550    moor                           21789    gartenland                   13096

Preparation:

In [48]:
%%time
from rtree import index

def get_arr(line_string):
    """Gets array from CSV line string"""
    line_arr = line_string.split(',', 7)
    if len(line_arr) == 0:
        raise ValueError(f"Problem with line: {line_string}")
    return line_arr

def r_within(original_shape, shape_index, loc_point):
    """Return True if loc_point lies within any polygon of original_shape,
    using the R-tree index as a coarse pre-filter before the exact test."""
    for i in shape_index.intersection((loc_point.coords[0])):
        if loc_point.within(original_shape.geometry[i]):
            # at least one exact intersection found
            return True
    return False

def perc_itsc(x, x_isc):
    if x == 0 or x_isc == 0:
        return 0
    return int(x_isc/(x/100))

locations_intersect = set()
locations_nointersect = set()
processed_guids = set()
guids_path = Path.cwd() / "all_intersected_guids.csv"
if guids_path.exists():
    with open(guids_path, 'r', encoding='utf8') as read_guids:
        for line in read_guids:
            processed_guids.add(line.rstrip())
    print(f'Loaded {len(processed_guids)} guids that have already been processed.')

# add some locations that we always want to exclude (e.g. low geoaccuracy)
locations_nointersect.add(f'{10.3822031}::{51.2024651}') # Deutschland
locations_nointersect.add(f'{8.448899}::{49.362361}') # Deutschland / Germany

lbsm_data = Path.cwd() / "02_LBSM_DE" / "2019-02-13_Germany.csv"
Wall time: 1.69 s

Select ATKIS category to process and (re-)create RTree:

In [46]:
%%time
# select cat
atkis_cat = 'gruenland'
output_path = Path.cwd() / "03_Output_LBSM" / f"Germany_LBSM_{atkis_cat}.csv"
atkis_data = Path.cwd() / "01_ATKIS_Separate" / f"{atkis_cat}.shp"
atkis_shapes = gp.read_file(atkis_data, encoding='utf-8')

# Populate R-tree index with bounds of atkis polygons
idx = index.Index()
for pos, poly in enumerate(atkis_shapes.geometry):
    idx.insert(pos, poly.bounds)

Loop through all LBSM records: processing 1 million records takes about 37 seconds!

Notice that we do not use csv.reader(): given the structure of our data, manual splitting improves speed because we only extract the columns that are necessary for filtering.
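The effect of the `maxsplit` argument to `str.split` can be seen in a minimal sketch (the example line is made up, but mirrors the column layout used here):

```python
# str.split with maxsplit stops after n splits, so the potentially long
# free-text remainder of the CSV line stays in one piece, unparsed.
line = '2,abc123,51.05,13.74,user42,2019-01-01,2019-01-02,some long text, with commas'
line_arr = line.split(',', 7)
print(len(line_arr))  # 8
print(line_arr[7])    # 'some long text, with commas'
```

A full csv.reader() would tokenize every field of every line, including quoted free text; the bounded split touches only the leading columns we actually need.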

In [49]:
%%time
from IPython.display import clear_output
processed_guids.clear()
x = 0
x_pos = 0
with open(lbsm_data, "r", encoding="utf-8") as read_handle, \
     open(output_path, 'w', encoding='utf8') as write_handle, \
     open(guids_path, 'a+', encoding='utf8') as write_guids:
    headerline = next(read_handle)
    #print(headerline)
    new_headerline = f'origin_id,post_guid,origin_dist,atkis_cat,gemeinde_typ,post_time,{get_arr(headerline)[7]}'
    write_handle.write(f'{new_headerline}')
    #print(new_headerline)
    for line in read_handle:
        # update status
        msg_text = f'({atkis_cat.upper()}) Processed records: {x} Intersection found: {perc_itsc(x, x_pos)}% ({x_pos})'
        if x % 100000 == 0:
            clear_output(wait=True)
            print(msg_text)
        #if x > 1000000:
            #break
        x += 1
        # get first few columns
        line_arr = get_arr(line.rstrip())
        post_guid = line_arr[1]
        # skip already processed guids
        if post_guid in processed_guids:
            continue
        
        # assign columns
        origin = line_arr[0]
        user_guid = line_arr[4]
        lng = float(line_arr[3])
        lat = float(line_arr[2])
        loc_id = f'{lat}::{lng}'
        
        # test for intersection
        if not loc_id in locations_intersect:
            if loc_id in locations_nointersect:
                continue
            if not r_within(atkis_shapes, idx, Point(lng, lat)):
                locations_nointersect.add(loc_id)
                continue
            locations_intersect.add(loc_id)
        # positive intersection, update counters
        processed_guids.add(post_guid)
        write_guids.write(f'{post_guid}\n')
        x_pos += 1
        
        # merge post create & upload time
        create_time = line_arr[5]
        upload_time = line_arr[6]
        post_time = ""
        if create_time:
            post_time = create_time
        elif upload_time:
            post_time = upload_time
            
        # get bbsr gemeindetyp
        gemeinde_typ = get_bbsr_type(Point(lng, lat))
        
        # get origin / location dist
        distance = ""
        if origin == '2' and user_guid in user_geocode_dict:
            distance = get_dist(Point(lng, lat), user_geocode_dict.get(user_guid))
            
        # write line
        write_handle.write(f'{origin},{post_guid},{distance},{atkis_cat},"{gemeinde_typ}",{post_time},{line_arr[7]}\n')
        
# final status
clear_output(wait=True)
print(msg_text)
(GRUENLAND) Processed records: 34100000 Intersection found: 2% (803695)
(GRUENLAND) Processed records: 34166789 Intersection found: 2% (806330)
Wall time: 37min 25s