In this notebook, we'll explore ways to visualize activity rankings across different types of land use. The land use data is derived from ATKIS Basis-DLM (selected categories) and intersected with geolocated social media posts (Flickr, Instagram, Twitter). Of the originally 35 million social media posts, about 8 million fall within the subset of chosen categories. This data forms the basis for the analysis in this notebook. The process for intersecting ATKIS and LBSM data is shown here.
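That preprocessing step is not repeated here. As a rough, hypothetical sketch, such an intersection boils down to a point-in-polygon spatial join, for example with geopandas (the file names, column names and the 'within' predicate below are assumptions for illustration, not the original workflow):
# hypothetical sketch of the point-in-polygon intersection (illustration only)
import geopandas as gpd
import pandas as pd
# ATKIS land use polygons for the selected categories (assumed file/column names)
land_use = gpd.read_file('atkis_selected_categories.gpkg')
# geolocated LBSM posts with latitude/longitude columns (assumed file/column names)
posts = pd.read_csv('lbsm_posts.csv')
posts_gdf = gpd.GeoDataFrame(
    posts,
    geometry=gpd.points_from_xy(posts['longitude'], posts['latitude']),
    crs='EPSG:4326').to_crs(land_use.crs)
# keep only posts that fall inside one of the selected land use polygons
intersected = gpd.sjoin(posts_gdf, land_use, how='inner', predicate='within')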
First, we start with our imports and get logging established:
# imports needed and set up logging
# import gensim
import os
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import holoviews as hv
import re
from collections import defaultdict
from collections import namedtuple
import csv
from pathlib import Path
import numpy as np
import pandas as pd
hv.extension('bokeh')
The dataset(s) we will be loading have already been intersected with land-use data. Therefore, we can dive straight into the analysis, without prior classification. Let's have a look first:
Post = namedtuple('Post', 'origin_id post_guid user_guid post_body post_title hashtags emoji post_time')
data_file = '03_Output_LBSM/Germany_LBSM_weinbau.csv'
def get_post(post_line):
"""Concatenate topic info from post columns"""
origin_guid = post_line.get('origin_id')
post_guid = post_line.get('post_guid')
user_guid = post_line.get('user_guid')
post_title = post_line.get('post_title')
post_body = post_line.get('post_body')
hashtags = post_line.get('tags').split(';')
emoji = post_line.get('emoji').split(';')
post_time_hr = post_line.get('post_time')[:10] # keep only the date part of the timestamp
return Post(origin_guid, post_guid, user_guid, post_body, post_title, hashtags, emoji, post_time_hr)
with open(data_file, 'r', encoding="utf-8") as file_handle:
post_reader = csv.DictReader(
file_handle,
delimiter=',',
quotechar='"',
quoting=csv.QUOTE_MINIMAL)
for ix, post in enumerate(post_reader):
print(f'{post}\n')
lbsn_post = get_post(post)
print(f'{lbsn_post}')
break
Now that we've had a sneak peek at our dataset, we can define helpers to read it, so that we can pass posts on to the ranking step. We'll stream files and process one post at a time to reduce the memory burden.
def scan_local_files():
"""Read Local Files according to config parameters"""
pathname = Path.cwd()
input_path = pathname / '03_Output_LBSM'
filelist = list(input_path.glob('*.csv'))
return filelist
def read_input_file(input_file):
"""Read Input file lines and convert to post"""
logging.info(f"Reading file {os.path.basename(input_file)}..")
with open(input_file, 'r', encoding="utf-8") as file_handle:
post_reader = csv.DictReader(
file_handle,
delimiter=',',
quotechar='"',
quoting=csv.QUOTE_MINIMAL)
for ix, post_line in enumerate(post_reader):
lbsn_post = get_post(post_line)
if (ix % 100000 == 0):
logging.info(f"read {ix} posts")
# yield posts one at a time instead of building a full list in memory
yield lbsn_post
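As a quick sanity check, the two helpers can be chained. This is a minimal sketch, assuming the first CSV found in 03_Output_LBSM has the expected columns:
# peek at the first post of the first file without loading everything into memory
files = scan_local_files()
first_post = next(read_input_file(files[0]))
print(first_post)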
First, we'll define our topics. A topic is defined as a list of terms. Note that an "activity" can be defined narrowly or broadly.
In conclusion, we want to define our topics as diversely as possible. Some activities or groups of activities might overlap, while others might describe opposite ends of a continuum of possible activity groups. The goal here is not to be comprehensive, but to get a cross-section of a selected list of relevant green space activities.
Furthermore, the term lists below mix English and German terms as well as emoji:
topics = dict()
topics['hiking'] = ('hike', 'hiking', 'wandern', 'wanderung', 'wanderer', 'wanderweg', 'wanderroute', '🥾') # optional: 🚶 (person walking)
# biking, this is a very specific activity
topics['biking'] = ('bike', 'biking', 'bicycle', 'cycling', 'fahrrad', 'velo', '🚲', '🚴')
# just plain walking
topics['walking'] = ('walk', 'walking', 'spazieren', 'stroll', 'fußweg', 'spazierweg', 'spaziergang') # optional: 🚶 (person walking)
# broad category with a bias towards jogging
topics['sport'] = ('sport', 'jogging', 'running', 'exercise', 'run', 'workout', 'rennen', 'dauerlauf', '🏃')
topics['relaxing'] = ('relaxing', 'sitting', 'relaxation', 'entspannen', 'innehalten', 'erholen', 'ausruhen', 'recreation')
# meeting with friends, this can encompass a group of activities
# note that we use 'meeting'; in green-space land use, this likely hints to meeting with friends, not within work environment
topics['friends'] = ('friends', 'meeting', 'socialize', 'freunde', 'treffen', 'hang around', 'abhängen')
# anything related to family and kinder/kids
topics['family'] = ('family', 'familie', 'kinder', 'baby', 'familienausflug', 'familytrip', '👪')
# tourist/sightseeing group
topics['tourist'] = ('tourist', 'sightseeing', 'sehenswürdigkeit', 'excursion', 'exkursion', 'sight-seeing', 'tour', 'travel', 'reise', '🌇')
# very general: spielen/playing
topics['playing'] = ('spielen', 'playing', 'play', 'spiel', 'game', '🎲', '🎮')
# let's add some specific activities: picnic/barbecue (grillen), soccer ..
topics['picnic'] = ('picnic', 'barbecue', 'picknick', 'picknickkorb', 'grillen', 'grill')
topics['soccer'] = ('soccer', 'fussball', 'fußball', 'football', '⚽')
For selecting posts and counting userdays based on topic-terms, we define the following rules:
%%time
from IPython.display import clear_output
def word_in_text(word, text_value):
"""Checks whether full word is in string"""
if re.search(r'\b' + re.escape(word) + r'\b', text_value, re.IGNORECASE):
return True
def check_topic(topic, lbsn_post):
"""Checks whether topic is in post"""
for term in topic:
if \
term in lbsn_post.hashtags or \
term in lbsn_post.emoji or \
word_in_text(term, lbsn_post.post_title) or \
word_in_text(term, lbsn_post.post_body):
return True
def get_userday(post):
"""Userday key: each user is counted once per day"""
return f'{post.user_guid}{post.post_time}'
# init count structures
total_counts = dict()
total_userdaycounts = defaultdict(set)
cnt_dict = dict()
userday_cnt_dict = dict()
# init dicts for each topic
for activity_name in topics.keys():
# use default dict to init int:
cnt_dict[activity_name] = defaultdict(int)
# use set for counting userdays:
userday_cnt_dict[activity_name] = defaultdict(set)
# perform topic matching
for file_name in scan_local_files():
# get land use type from filename
f_name = os.path.basename(file_name)
if f_name == 'all_intersected_guids.csv':
# skip
continue
# strip leading 'Germany_LBSM_' and trailing '.csv'
# (slicing is used because rstrip('.csv') would strip characters, not the suffix)
type_text = f_name[13:-4]
total_counts[type_text] = 0
# loop posts
for lbsn_post in read_input_file(file_name):
# count post
total_counts[type_text] += 1
# count userday
userday = get_userday(lbsn_post)
total_userdaycounts[type_text].add(userday)
for activity_name, topic_terms in topics.items():
if check_topic(topic_terms, lbsn_post):
# count post
cnt_dict[activity_name][type_text] += 1
# count userday
userday_cnt_dict[activity_name][type_text].add(userday)
# count distinct userdays
for activity_name in topics.keys():
userday_cnt_dict[activity_name][type_text] = len(userday_cnt_dict[activity_name][type_text])
clear_output(wait=True)
selected_cnt = sum([sum(x.values()) for x in cnt_dict.values()])
total_cnt = sum(total_counts.values())
perc_cnt = selected_cnt/(total_cnt/100)
print(
f'Done. Found topic matches in {selected_cnt} posts '
f'of {total_cnt} total posts ({perc_cnt:.2f}%)')
for land_use in total_userdaycounts.keys():
total_userdaycounts[land_use] = len(total_userdaycounts[land_use])
selected_userdays = sum([sum(x.values()) for x in userday_cnt_dict.values()])
total_userdays = sum(total_userdaycounts.values())
perc_userdays = selected_userdays /(total_userdays/100)
print(
f'Done. Found topic matches in {selected_userdays} userdays '
f'of {total_userdays} total userdays ({perc_userdays:.2f}%)')
Convert the dicts to pandas DataFrames for easier handling. We can choose to analyse absolute post counts here (prone to bias from very active users, but fast) or userdays (less prone to bias, but slower to calculate).
#df = pd.DataFrame.from_dict(cnt_dict)
df = pd.DataFrame.from_dict(userday_cnt_dict)
# get preview
df.style.background_gradient(cmap='viridis')
# post counts:
#df_total = pd.DataFrame.from_dict(
# total_counts.items())
# user days:
df_total = pd.DataFrame.from_dict(
total_userdaycounts.items())
Optional: store intermediate results (pandas dataframe pickle)
# write:
#df.to_pickle("activity_intermediate_userdays.pkl")
#df_total.to_pickle("activity_total_userdays.pkl")
# load:
df = pd.read_pickle("activity_intermediate_userdays.pkl")
df_total = pd.read_pickle("activity_total_userdays.pkl")
Compare to total post counts:
print('Post count per topic')
df_postcount = pd.DataFrame.from_dict(cnt_dict)
df_postcount.style.background_gradient(cmap='viridis')
These are absolute values with little meaning on their own, because some land use types simply appear more often; similarly, some activities typically have a higher frequency of matches. To normalize these values, we'll therefore first calculate percentages for each land use category. Afterwards, we can normalize (i.e. stretch) the results for each activity to a 0-1 range.
Replace index names for display:
name_ref = {
'gruenland':'Gruenland',
'ackerland':'Ackerland',
'laubholz':'Laubholz',
'nadelholz':'Nadelholz',
'gehoelz':'Gehoelz',
'mischholz':'Mischholz',
'sportfreizeiterholung':'sonst. Sport-, Freizeit-, Erholungsfl.',
'streuobst':'Streuobst',
'parkgruenanlage':'Park, Gruenanlage',
'friedhof':'Friedhof',
'kleingarten':'Kleingarten',
'moor':'Moor',
'weinbau':'Weinbau',
'obstbau':'Obstbau',
'sonstlandwirt':'sonst. Landwirtschaftsfl.',
'sumpf':'Sumpf',
'wochenendferienhau':'Wochenend-, Ferienhaussiedl.',
'gartenland':'Gartenland',
'heide':'Heide',
'sonstsiedlungsfreifl':'sonstige Siedlungsfreifl.',
'golfplatz':'Golfplatz',
}
df.rename(index=name_ref, inplace=True)
df.index
#for dict_key, value_count in total_counts.items():
# total_counts[name_ref.get(dict_key)] = value_count
# total_counts.pop(dict_key)
df_total.columns = ['Land use', 'User Days']
df_total = df_total.set_index(['Land use'])
df_total.rename(index=name_ref, inplace=True)
df_total['Percentage'] = df_total['User Days']/(total_userdays/100)
df_total.style.background_gradient(cmap='summer')
# transpose
df_perc = df.T
# normalize using total counts for each land use cat
for type_text, total_count in total_userdaycounts.items():
type_text = name_ref.get(type_text)
df_perc[type_text] = df_perc[type_text]/(total_count/100)
# transpose again
df_perc = df_perc.T
# show percentages
df_perc.style.background_gradient(cmap='summer')
#df.index
#df.columns
#df.shape
We'll use a HoloViews HeatMap to display the data:
from holoviews import opts
hv.HeatMap({'x': df_perc.columns, 'y': df_perc.index, 'z': df_perc}, ['x', 'y'], 'z'
).opts(opts.HeatMap(tools=['hover'], colorbar=True, width=700, height=400,cmap='greens'))
To improve legibility and colorization, we stretch the values for each activity to the 0-1 range. Furthermore, we use log values to reduce peaks and highlight information in the long tail.
def normalize(df):
result = df.copy()
for feature_name in df.columns:
max_value = df[feature_name].max()
min_value = df[feature_name].min()
result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
return result
# log scale (reduce peaks) and normalize (0-1 range)
df_norm = normalize(np.log(df_perc))
Calculate alpha values (cell transparency) from the total available userdays per land use (= accuracy):
# log-scale and normalize between 0.5 and 1 (= final transparency)
df_total['Log-Norm. Percentage'] = np.interp(
np.log(df_total['Percentage']), np.log((df_total['Percentage'].min(), df_total['Percentage'].max())), (0.5, 1))
df_alpha = df_total['Log-Norm. Percentage']
np_alpha = df_alpha.values
np_alpha = np.tile(np_alpha, (len(topics), 1)).transpose()
df_alpha = pd.DataFrame(np_alpha)
df_alpha.index = df.index
from holoviews import dim, opts
from bokeh.models import HoverTool
def hook(plot, element):
# remove axis for plot
plot.handles['xaxis'].visible = False
plot.handles['yaxis'].visible = False
plot.outline_line_color = None
plot.border_fill_color = None
plot.background_fill_color = None
plot.outline_line_width = 0
plot.outline_line_alpha = 0
#plot.axis.visible = False
# explicitly declare hover tool so we can add "%" sign
TOOLTIPS = [
('Activity (LBSM)', '@x'),
('Land Use (ATKIS)', '@y'),
('Relative importance (Log & Norm 0-1)', '@z{1.1111}'),
('Percentage of userdays (abs)', '@z2{1.11}%'),
('Total userdays (abs)', '@z3'),
]
hover = HoverTool(tooltips=TOOLTIPS)
hv.HeatMap({'x': df.columns, 'y': df.index, 'z': df_norm, 'z2': df_perc, 'z3': df, 'z4': df_alpha},
kdims=[('x', 'Activity (LBSM)'), ('y', 'Land Use (ATKIS)')],
vdims=['z', 'z2', 'z3', 'z4'],
).opts(
opts.HeatMap(
title_format="Heatmap for selected ATKIS categories and LBSM activities",
tools=[hover],
colorbar=True,
width=720,
height=520,
cmap='greens'
#alpha='z4' # dim cells based on total available posts (=accuracy)
)
)
# use http://tools.zenverse.net/word-wrap/ for word wrap
The dot product (Skalarprodukt) of the value vectors of two columns (i.e. activities) or two rows (i.e. land uses) can be used to compare their patterns based on cosine similarity. A cosine similarity of 1 means the patterns are identical, whereas 0 means they are completely different.
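For two value vectors, the cosine similarity is the dot product divided by the product of the vector norms. A minimal NumPy sketch, equivalent to the 1 - cosine(..) distance calls from scipy used below:
import numpy as np
# cosine similarity computed manually via the dot product
a = np.asarray(df['hiking'], dtype=float)
b = np.asarray(df['biking'], dtype=float)
cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(f'hiking/biking (manual): {cos_sim}')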
from scipy.spatial.distance import cosine
from pandas import DataFrame
print(f'hiking/biking: {1 - cosine(df["hiking"], df["biking"])}')
print(f'walking/soccer: {1 - cosine(df["walking"], df["soccer"])}')
print(f'sport/soccer: {1 - cosine(df["sport"], df["soccer"])}')
print(f'Park, Gruenanlage/Friedhof: {1 - cosine(df.loc["Park, Gruenanlage"], df.loc["Friedhof"])}')
print(f'Golfplatz/Nadelholz: {1 - cosine(df.loc["Golfplatz"], df.loc["Nadelholz"])}')
We can use these similarity scores to re-order the heatmap. Seaborn, for example, offers clustermap, which allows specifying different clustering methods and distance metrics. There are many other ways to create clustered heatmaps (see links below).
import seaborn as sns
heatmap_sns = sns.clustermap(df_norm, metric="correlation", standard_scale=1, method="ward", cmap="Greens")
heatmap_sns.savefig("clusterheatmap_userdays_greens.png")
heatmap_sns.savefig("clusterheatmap_userdays_greens.svg",format="svg")
print(f'rows: {heatmap_sns.dendrogram_row.reordered_ind}')
print(f'columns: {heatmap_sns.dendrogram_col.reordered_ind}')
# get col and row names by ID
colname_list = [df.columns[col_id] for col_id in heatmap_sns.dendrogram_col.reordered_ind]
rowname_list = [df.index[row_id] for row_id in heatmap_sns.dendrogram_row.reordered_ind]
# change row/col order
df_ro = df.reindex(rowname_list)
df_ro = df_ro[colname_list]
df_norm_ro = df_norm.reindex(rowname_list)
df_norm_ro = df_norm_ro[colname_list]
df_perc_ro = df_perc.reindex(rowname_list)
df_perc_ro = df_perc_ro[colname_list]
print(rowname_list)
print(colname_list)
# %%output filename="meingruen_activities_userdays" # uncomment for output to file
heatm = hv.HeatMap({'x': df_ro.columns, 'y': df_ro.index, 'z': df_norm_ro, 'z2': df_perc_ro, 'z3': df_ro, 'z4': df_alpha},
kdims=[('x', 'Activity (LBSM)'), ('y', 'Land Use (ATKIS)')],
vdims=['z', 'z2', 'z3', 'z4'],
).opts(
opts.HeatMap(
title_format="Heatmap for selected ATKIS categories and LBSM activities",
tools=[hover],
colorbar=True,
width=720,
height=520,
cmap='greens'
#alpha='z4' # dim cells based on total available posts (=accuracy)
)
)
heatm + \
hv.Text(x=0.01, y=0.5,
text='Geotagged Social Media posts (Twitter,\n'
'Instagram, Flickr) have first been\n'
'intersected with ATKIS geometries for\n'
'Germany. This heatmap shows the correlation\n'
'between selected activities expressed in\n'
'intersected Social Media posts and the bias\n'
'for certain land use types (ATKIS).\n'
'Dark-green colors mean high correlation (1),\n'
'whereas lighter colors mean low correlation\n'
'(0) between land use and activity. Columns\n'
'(activities) and rows (land use) have been\n'
'ordered using 2-D cosine-similarity\n'
'clustering, with the goal of grouping cells of\n'
'similar patterns. The base measure here is\n'
'Userdays (see Wood, Guerry, Silver, & Lacayo,\n'
'2013). Each user is counted once per day and\n'
'activity.'
).opts(
height=450, show_frame=False, hooks=[hook], text_align='left', text_font_size='13px')
Export to SVG:
from bokeh.io import export_svgs
p = hv.render(heatm, backend='bokeh')
p.output_backend = "svg"
export_svgs(p, filename="heatmap_userdays_greens.svg")