This is the last notebook in a series of four notebooks:
In this notebook, we'll illustrate how to make further analyses based on the published hll benchmark dataset.
import sys
from pathlib import Path
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
sys.path.append(module_path)
from _03_yfcc_gridagg_hll import *
Load all dependencies at once, as a means to verify that everything required to run this notebook is available.
import geoviews as gv
import holoviews as hv
import geoviews.feature as gf
import pickle
import ipywidgets as widgets
from ipywidgets.embed import embed_minimal_html
from rtree import index
from geopandas.tools import sjoin
from geoviews import opts
from bokeh.models import HoverTool, FixedTicker
from matplotlib_venn import venn3, venn3_circles
Enable bokeh
preparations.init_imports()
Activate autoreload of changed python files:
%load_ext autoreload
%autoreload 2
Load memory profiler:
%load_ext memory_profiler
Print versions of packages used herein, for purposes of replication:
root_packages = [
'ipywidgets', 'geoviews', 'geopandas', 'holoviews', 'rtree', 'matplotlib-venn', 'xarray']
preparations.package_report(root_packages)
Geoviews and Holoviews can be used to create interactive maps, either insider Jupyter or as externally stored as HTML. The syntax of the plot methods must be slightly adapted from the matplotlib output. Given the advanced features of interactivity, we can also add additional information that is shown e.g. on mouse hover over certain grid cells.
Loading (geodataframe) using pickle. This is the easiest way to load intermediate data, but may be incompatible if package versions change. If loading pickles does not work, a workaround is to load data from CSV and re-create pickle data, which will be compatible with used versions. Have a look at 03_yfcc_gridagg_hll.ipynb
to recreate pickles with current package versions.
Load pickle and merge results of raw and hll dataset:
grid_est = pd.read_pickle(OUTPUT / "pickles" / "yfcc_all_est.pkl")
grid_raw = pd.read_pickle(OUTPUT / "pickles" / "yfcc_all_raw.pkl")
grid = grid_est.merge(
grid_raw[['postcount', 'usercount', 'userdays']],
left_index=True, right_index=True)
Have a look at the numbers for exact and estimated values. Smaller values are exact in both hll and raw because Sparse Mode is used.
grid[grid["usercount_est"]>5].head()
Some updates to the plotting function are necessary, for compatibility with geoviews/holoviews interactive mapping.
def label_nodata_xr(
grid: gp.GeoDataFrame, inverse: bool = None,
metric: str = "postcount_est", scheme: str = None,
label_nonesignificant: bool = None, cmap_name: str = None):
"""Create a classified colormap from pandas dataframe
column with additional No Data-label
Args:
grid: A geopandas geodataframe with metric column to classify
inverse: If True, colors are inverted (dark)
metric: The metric column to classify values
scheme: The classification scheme to use.
label_nonesignificant: If True, adds an additional
color label for none-significant values based
on column "significant"
cmap_name: The colormap to use.
Adapted from:
https://stackoverflow.com/a/58160985/4556479
See available colormaps:
http://holoviews.org/user_guide/Colormaps.html
See available classification schemes:
https://pysal.org/mapclassify/api.html
"""
# get headtail_breaks
# excluding NaN values
grid_nan = grid[metric].replace(0, np.nan)
# get classifier scheme by name
classifier = getattr(mc, scheme)
# some classification schemes (e.g. HeadTailBreaks)
# do not support specifying the number of classes returned
# construct optional kwargs with k == number of classes
optional_kwargs = {"k":9}
if scheme == "HeadTailBreaks":
optional_kwargs = {}
# explicitly set dtype to float,
# otherwise mapclassify will error out due
# to unhashable type numpy ndarray
# from object-column
scheme_breaks = classifier(
grid_nan.dropna().astype(float), **optional_kwargs)
# set breaks category column
# to spare cats:
# increase by 1, 0-cat == NoData/none-significant
spare_cats = 1
grid[f'{metric}_cat'] = scheme_breaks.find_bin(
grid_nan) + spare_cats
# set cat 0 to NaN values
# to render NoData-cat as transparent
grid.loc[grid_nan.isnull(), f'{metric}_cat'] = np.nan
# get label bounds as flat array
bounds = get_label_bounds(
scheme_breaks, grid_nan.dropna().values, flat=True)
cmap_name = cmap_name
cmap = plt.cm.get_cmap(cmap_name, scheme_breaks.k)
# get hex values
cmap_list = [colors.rgb2hex(cmap(i)) for i in range(cmap.N)]
# prepend nodata color
# shown as white in legend
first_color_legend = '#ffffff'
if inverse:
first_color_legend = 'black'
cmap_list = [first_color_legend] + cmap_list
cmap_with_nodata = colors.ListedColormap(cmap_list)
return cmap_with_nodata, bounds, scheme_breaks
Since interactive display of grid-polygons is too slow, we're converting the grid to an xarray object, which is then overlayed as a rastered image. Also added here is the option to show additional columns on hover, which will be later used to compare raw and hll values.
def get_custom_tooltips(items: Dict[str, str]) -> str:
"""Compile HoverTool tooltip formatting with items to show on hover"""
# thousands delimitor formatting
# will be applied to the for the following columns
tdelim_format = [
'usercount_est', 'postcount_est', 'userdays_est',
'usercount', 'postcount', 'userdays']
# in HoloViews, custom tooltip formatting can be
# provided as a list of tuples (name, value)
tooltips=[
# f-strings explanation:
# - k: the item name, v: the item value,
# - @ means: get value from column
# optional formatting is applied using
# `"{,f}" if v in thousand_formats else ""`
# - {,f} means: add comma as thousand delimiter
# only for values in tdelim_format (usercount and postcount)
# else apply automatic format
(k,
f'@{v}{"{,f}" if v.replace("_expected", "") in tdelim_format else ""}'
) for k, v in items.items()
]
return tooltips
def set_active_tool(plot, element):
"""Enable wheel_zoom in bokeh plot by default"""
plot.state.toolbar.active_scroll = plot.state.tools[0]
def convert_gdf_to_gvimage(
grid, metric, cat_count,
additional_items: Dict[str, str] = None) -> gv.Image:
"""Convert GeoDataFrame to gv.Image using categorized
metric column as value dimension
Args:
grid: A geopandas geodataframe with indexes x and y
(projected coordinates) and aggregate metric column
metric: target column for value dimension.
"_cat" will be added to retrieve classified values.
cat_count: number of classes for value dimension
additional_items: a dictionary with optional names
and column references that are included in
gv.Image to provide additional information
(e.g. on hover)
"""
if additional_items is None:
additional_items_list = []
else:
additional_items_list = [
v for v in additional_items.values()]
# convert GeoDataFrame to xarray object
# the first vdim is the value being used
# to visualize classes on the map
# include additional_items (postcount and usercount)
# to show exact information through tooltip
xa_dataset = gv.Dataset(
grid.to_xarray(),
vdims=[
hv.Dimension(
f'{metric}_cat', range=(0, cat_count))]
+ additional_items_list,
crs=crs.Mollweide())
return xa_dataset.to(gv.Image, crs=crs.Mollweide())
def plot_interactive(grid: gp.GeoDataFrame, title: str,
metric: str = "postcount_est", store_html: str = None,
additional_items: Dict[str, str] = {
'Post Count':'postcount',
'User Count':'usercount',
'User Days':'userdays',
'Post Count (estimated)':'postcount_est',
'User Count (estimated)':'usercount_est',
'User Days (estimated)':'userdays_est'},
scheme: str = "HeadTailBreaks",
cmap: str = "OrRd",
inverse: bool = None,
output: Path = OUTPUT):
"""Plot interactive map with holoviews/geoviews renderer
Args:
grid: A geopandas geodataframe with indexes x and y
(projected coordinates) and aggregate metric column
metric: target column for aggregate. Default: postcount.
store_html: Provide a name to store figure as interactive HTML.
title: Title of the map
additional_items: additional items to show on hover
scheme: The classification scheme to use. Default "HeadTailBreaks".
cmap: The colormap to use. Default "OrRd".
inverse: plot other map elements inverse (black)
output: main folder (Path) for storing output
"""
# check if all additional items are available
for key, item in list(additional_items.items()):
if item not in grid.columns:
additional_items.pop(key)
# work on a shallow copy to not modify original dataframe
grid_plot = grid.copy()
# classify metric column
cmap_with_nodata, bounds, headtail_breaks = label_nodata_xr(
grid=grid_plot, inverse=inverse, metric=metric,
scheme=scheme, cmap_name=cmap)
# construct legend labels for colormap
label_dict = {}
for i, s in enumerate(bounds):
label_dict[i+1] = s
label_dict[0] = "No data"
cat_count = len(bounds)
# create gv.image layer from gdf
img_grid = convert_gdf_to_gvimage(
grid=grid_plot,
metric=metric, cat_count=cat_count,
additional_items=additional_items)
# define additional plotting parameters
# width of static jupyter map,
# 360° == 1200px
width = 1200
# height of static jupyter map,
# 360°/2 == 180° == 600px
height = int(width/2)
responsive = False
aspect = None
# if stored as html,
# override values
if store_html:
width = None
height = None
responsive = True
# define width and height as optional parameters
# only used when plotting inside jupyter
optional_kwargs = dict(width=width, height=height)
# compile only values that are not None into kwargs-dict
# by using dict-comprehension
optional_kwargs_unpack = {
k: v for k, v in optional_kwargs.items() if v is not None}
# prepare custom HoverTool
tooltips = get_custom_tooltips(
additional_items)
hover = HoverTool(tooltips=tooltips)
# create image layer
image_layer = img_grid.opts(
color_levels=cat_count,
cmap=cmap_with_nodata,
colorbar=True,
clipping_colors={'NaN': 'transparent'},
colorbar_opts={
# 'formatter': formatter,
'major_label_text_align':'left',
'major_label_overrides': label_dict,
'ticker': FixedTicker(
ticks=list(range(0, len(label_dict)))),
},
tools=[hover],
# optional unpack of width and height
**optional_kwargs_unpack
)
edgecolor = 'black'
bg_color = None
fill_color = '#479AD4'
alpha = 0.1
if inverse:
alpha = 1
bg_color = 'black'
edgecolor = 'white'
# combine layers into single overlay
# and set global plot options
gv_layers = (gf.ocean.opts(alpha=alpha, fill_color=fill_color) * \
image_layer * \
gf.coastline.opts(line_color=edgecolor) * \
gf.borders.opts(line_color=edgecolor)
).opts(
bgcolor=bg_color,
# global_extent=True,
projection=crs.Mollweide(),
responsive=responsive,
data_aspect=1, # maintain fixed aspect ratio during responsive resize
hooks=[set_active_tool],
title=title)
if store_html:
hv.save(gv_layers, output / "html" / f'{store_html}.html', backend='bokeh')
else:
return gv_layers
plot interactive in-line:
gv_plot = plot_interactive(
grid, title=f'YFCC User Count (estimated) per {int(GRID_SIZE_METERS/1000):.0f}km grid', metric="usercount_est",)
gv_plot