Alexander Dunkel, TU Dresden, Institute of Cartography; Maximilian Hartmann, Universität Zürich (UZH), Geocomputation
This is the fourth notebook in a series of nine notebooks:
Merge sunset and sunrise chi values to a single map
Using a diverging colormap, we combine only the positive chi values of sunset and sunrise in a single map, to increase the information content per map.
For merging sunset and sunrise chi, the following considerations apply:
Import code from other jupyter notebooks, synced to *.py with jupytext:
import sys
from pathlib import Path
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
sys.path.append(module_path)
# import all previous chained notebooks
from _03_chimaps import *
from modules import preparations
Override which metric is used to calculate chi values. The default is `usercount_est`.
# Uncomment exactly one of the following lines to override the metric
# used for the chi calculation; if all stay commented out, the value of
# CHI_COLUMN that is already in scope (not defined in this cell) is used.
# CHI_COLUMN = "postcount_est"
# CHI_COLUMN = "userdays_est"
# CHI_COLUMN = "usercount_est"
# show the metric that is currently active
display(CHI_COLUMN)
Activate autoreload of changed python files:
%load_ext autoreload
%autoreload 2
Load sunset and sunrise grid from CSV:
# Both topic grids are computed against the same expected CSV
# ("flickr_all_est.csv"); only the observed CSV differs per topic.
# Order matters: sunset is loaded first, then sunrise.
grid_sunset, grid_sunrise = (
    get_chimap_fromcsv(
        csv_expected="flickr_all_est.csv",
        csv_observed=observed_csv,
        chi_column=CHI_COLUMN)
    for observed_csv in ("flickr_sunset_est.csv", "flickr_sunrise_est.csv")
)
Normalize chi_values to 1 to 1000 range for comparison.
def norm_series(
        col_series: pd.Series, range_min: Optional[int] = 1,
        range_max: Optional[int] = 1000) -> np.ndarray:
    """Linearly rescale the values of a Series to a new value range.

    The smallest value of col_series is mapped to range_min, the largest
    to range_max; everything in between is interpolated linearly.

    Returns a NumPy array with one rescaled value per input element.
    """
    source_bounds = (col_series.min(), col_series.max())
    target_bounds = (range_min, range_max)
    return np.interp(col_series, source_bounds, target_bounds)
def norm_col_diverging(
        df: gp.GeoDataFrame, col_name: Optional[str] = "chi_value",
        range_min: Optional[int] = 1, range_max: Optional[int] = 1000,
        mask_minus: Optional[pd.Series] = None,
        mask_plus: Optional[pd.Series] = None):
    """Rescale the positive and the negative part of df[col_name] separately.

    Rows selected by mask_plus are normalized to range_min..range_max,
    rows selected by mask_minus to -range_max..-range_min. Rows matched by
    neither mask are left untouched. The column is modified in place;
    nothing is returned.

    If a mask is not supplied, it defaults to "value > 0" (mask_plus) or
    "value < 0" (mask_minus), evaluated on the column before any rescaling.
    """
    if mask_minus is None:
        mask_minus = df[col_name] < 0
    if mask_plus is None:
        mask_plus = df[col_name] > 0
    # one entry per slice: (row mask, target minimum, target maximum);
    # the positive slice is processed first, then the negative one
    slices = (
        (mask_plus, range_min, range_max),
        (mask_minus, np.negative(range_max), np.negative(range_min)),
    )
    for row_mask, target_min, target_max in slices:
        df.loc[row_mask, col_name] = norm_series(
            df.loc[row_mask, col_name], target_min, target_max)
Apply normalization
# norm_col_diverging rescales the "chi_value" column of each grid in place
for grid in (grid_sunrise, grid_sunset):
    norm_col_diverging(grid)
Validate
# Print the four boundary values of the normalized sunset grid.
# With the default ranges of norm_col_diverging these are expected to be
# 1000, 1, -1 and -1000 (provided positive and negative values exist).
# largest positive value
print(grid_sunset["chi_value"].max())
# smallest positive value
print(grid_sunset.loc[grid_sunset["chi_value"] > 0, "chi_value"].min())
# negative value closest to zero
print(grid_sunset.loc[grid_sunset["chi_value"] < 0, "chi_value"].max())
# most negative value
print(grid_sunset["chi_value"].min())
Rename cols, so that both grids can be merged:
def grid_rename_cols(grid: gp.GeoDataFrame, colmap: Dict[str, str]):
    """Rename the columns of grid in place.

    colmap maps existing column names to their new names. The passed
    object itself is modified; nothing is returned.
    """
    grid.rename(colmap, axis="columns", inplace=True)
Merge grids:
def merge_df(df_a: pd.DataFrame, df_b: pd.DataFrame, merge_cols: List[str]) -> pd.DataFrame:
    """Join selected columns of df_b onto df_a, matching rows by index.

    Only the columns listed in merge_cols are taken from df_b. Neither
    input is modified; a new DataFrame is returned.
    """
    right_subset = df_b[merge_cols]
    return df_a.merge(right_subset, left_index=True, right_index=True)
Specific syntax to merge grid columns with two topics (e.g. sunset/sunrise)
def merge_grids(
        grid_plus: gp.GeoDataFrame, grid_minus: gp.GeoDataFrame,
        t_plus: str = "sunset", t_minus: str = "sunrise") -> gp.GeoDataFrame:
    """Combine the grids of two topics into one grid with suffixed columns.

    The topic-specific columns of both input grids are renamed by appending
    "_<topic>" (t_plus for grid_plus, t_minus for grid_minus). The renamed
    t_minus columns are then joined onto grid_plus by index.

    Note: both input grids are renamed in place (see grid_rename_cols);
    the merged grid is returned as a new object.
    """
    # columns that exist once per topic and therefore need a topic suffix
    topic_cols = (
        'chi_value', 'significant', 'usercount_est',
        'postcount_est', 'userdays_est')
    colmap_plus = {col: f'{col}_{t_plus}' for col in topic_cols}
    colmap_minus = {col: f'{col}_{t_minus}' for col in topic_cols}
    grid_rename_cols(grid_plus, colmap_plus)
    grid_rename_cols(grid_minus, colmap_minus)
    # only the suffixed t_minus columns are taken over from grid_minus
    return merge_df(grid_plus, grid_minus, list(colmap_minus.values()))
# sunset is the "plus" topic, sunrise the "minus" topic (defaults of
# merge_grids); note that grid_sunset and grid_sunrise are renamed in place
grid = merge_grids(grid_sunset, grid_sunrise)
Create chi_value and significant column to store merged values
Comparison step, according to the rules defined in the introduction.
To later distinguish between chi_values from sunset and sunrise,
use negative values for sunrise and positive for sunset.
The syntax here is quite complex because a lot of values/combinations
need to be solved during aggregation. Consider the result with a grain of salt.
def merge_chi_value(grid: gp.GeoDataFrame, t_minus: str = "sunrise", t_plus: str = "sunset", metric: str = "chi_value", chi_column = CHI_COLUMN):
    """Merge the positive chi values (overrepresentation) of two topics
    into a single new column and join their significance.

    The grid is modified in place; nothing is returned. The following
    columns are written:

    - grid[metric]: merged value. Values taken from t_plus stay positive,
      values taken from t_minus are negated (for mapping on a diverging
      cmap). Rows not covered by steps (1) and (2) remain NaN.
    - grid["significant"]: True where the merged value is positive and
      t_plus is significant, or negative and t_minus is significant.
    - grid["underrepresented"]: name of one topic (t_plus or t_minus),
      only written for rows selected in step (3).

    Args:
        grid: Merged grid with columns "<metric>_<topic>",
            "significant_<topic>" and "<chi_column>_<topic>" for both
            topics. Can be given as a slice of a larger dataframe
            (e.g. focus on positive values only).
        t_minus: Topic mapped to negative merged values.
        t_plus: Topic mapped to positive merged values.
        metric: Base name of the value column to merge.
        chi_column: Base name of the count column checked for non-zero
            values in step (3). The default is bound to the value of
            CHI_COLUMN at function definition time.

    The procedure uses boolean indexing; the order of the steps matters.
    """
    # init columns
    grid[metric] = np.nan
    grid["significant"] = False
    # (1) --
    # get positive chi values grid slice (overrepresented)
    # i.e. rows where the value of both topics is > 0
    sel_mask = grid_slice(grid, t_minus, t_plus, metric, positive=True)
    gsel = grid[sel_mask]
    # take the t_plus value if only t_plus is significant, or if both
    # significance flags are equal and the t_plus value is greater;
    # otherwise take the negated t_minus value
    grid.loc[sel_mask, f'{metric}'] = np.where(
        (((gsel[f"significant_{t_minus}"] == False) & (gsel[f"significant_{t_plus}"] == True)) | # or
         ((gsel[f"significant_{t_minus}"] == gsel[f"significant_{t_plus}"]) & # and
          (gsel[f"{metric}_{t_plus}"] > gsel[f"{metric}_{t_minus}"]))),
        gsel[f"{metric}_{t_plus}"], # if True
        np.negative(gsel[f"{metric}_{t_minus}"]) # if False
        )
    # (2)--
    # special case: over- and underrepresentation present
    sel_mask = grid_slice(grid, t_minus, t_plus, metric, both=True)
    # rows of the slice where t_plus is positive: keep the t_plus value
    sel_mask_tplus = (sel_mask) & (grid[f"{metric}_{t_plus}"] > 0)
    gsel = grid[sel_mask_tplus]
    grid.loc[sel_mask_tplus, f'{metric}'] = gsel[f"{metric}_{t_plus}"]
    # rows of the slice where t_minus is positive: store the negated value
    sel_mask_tminus = (sel_mask) & (grid[f"{metric}_{t_minus}"] > 0)
    gsel = grid[sel_mask_tminus]
    grid.loc[sel_mask_tminus, f'{metric}'] = np.negative(gsel[f"{metric}_{t_minus}"])
    # --
    # join significance
    # the sign of the merged value tells which topic it came from, so the
    # significance flag of that topic is carried over
    mask_t_plus = (grid[f'{metric}'] > 0) & (grid[f"significant_{t_plus}"] == True)
    mask_t_minus =(grid[f'{metric}'] < 0) & (grid[f"significant_{t_minus}"] == True)
    grid.loc[mask_t_plus, 'significant'] = True
    grid.loc[mask_t_minus, 'significant'] = True
    # (3)--
    # last case: both underrepresented
    # join the larger of the two topics'
    # underrepresented references to one column
    sel_mask = grid_slice(grid, t_minus, t_plus, metric, negative=True)
    # only significant
    sel_mask = sel_mask \
        & ((grid[f"significant_{t_plus}"] == True) | (grid[f"significant_{t_minus}"] == True))
    # only where at least one metric is not null
    # (e.g. usercount_est_sunrise or usercount_est_sunset)
    sel_mask = sel_mask & ((grid[f"{chi_column}_{t_minus}"] != 0) | (grid[f"{chi_column}_{t_plus}"] != 0))
    gsel = grid[sel_mask]
    # store the topic *name* (not a value), using the same rule as in (1)
    # NOTE(review): both values are negative in this slice, so ">" selects
    # the topic whose value is closer to zero - confirm this is intended
    grid.loc[sel_mask, 'underrepresented'] = np.where(
        (((gsel[f"significant_{t_minus}"] == False) & (gsel[f"significant_{t_plus}"] == True)) | # or
         ((gsel[f"significant_{t_minus}"] == gsel[f"significant_{t_plus}"]) & # and
          (gsel[f"{metric}_{t_plus}"] > gsel[f"{metric}_{t_minus}"]))),
        t_plus, # if True
        t_minus # if False
        )
def grid_slice(
        grid: pd.DataFrame, t_minus: str = "sunrise", t_plus: str = "sunset",
        metric: str = "chi_value",
        positive: Optional[bool] = None, negative: Optional[bool] = None,
        both: Optional[bool] = None) -> pd.Series:
    """Return a boolean row mask for a sign combination of two topics.

    Despite its name, this function does not return a slice of the grid
    but a boolean Series (aligned to grid's index) that can be used for
    boolean indexing, e.g. grid[grid_slice(grid, positive=True)].

    Exactly one of the flags is expected; if several are given, the
    first truthy one in the order positive, negative, both decides which
    per-topic comparisons are made.

    Args:
        grid: DataFrame (or GeoDataFrame) with the columns
            "<metric>_<t_plus>" and "<metric>_<t_minus>".
        t_minus: Name of the "minus" topic.
        t_plus: Name of the "plus" topic.
        metric: Base name of the value column.
        positive: Select rows where both topics are > 0.
        negative: Select rows where both topics are < 0.
        both: Select rows where t_plus < 0 and t_minus > 0, plus all rows
            where neither of these two conditions holds (t_plus not < 0
            and t_minus not > 0). The second part therefore also matches
            rows in which values are zero or NaN.

    Raises:
        ValueError: If none of positive, negative or both is set.
    """
    # validate the flags first, before any column is accessed
    if not (positive or negative or both):
        raise ValueError(
            "Provide either one of negative, positive, or both")
    values_plus = grid[f"{metric}_{t_plus}"]
    values_minus = grid[f"{metric}_{t_minus}"]
    if positive:
        plus_mask = values_plus > 0
        minus_mask = values_minus > 0
    elif negative:
        plus_mask = values_plus < 0
        minus_mask = values_minus < 0
    else:
        # both: opposite signs, t_plus under- and t_minus overrepresented
        plus_mask = values_plus < 0
        minus_mask = values_minus > 0
    if both:
        # either both conditions hold, or neither of them does
        return (plus_mask & minus_mask) | (~plus_mask & ~minus_mask)
    return plus_mask & minus_mask
Merge
# modifies grid in place: writes the columns "chi_value", "significant"
# and "underrepresented" (default topics: sunrise = minus, sunset = plus)
merge_chi_value(grid)
Validation: the extreme values of both sunset and sunrise must be returned, i.e. -1000 (inverted) chi for sunrise and +1000 chi for sunset.
# largest merged value (taken from the sunset topic)
print(grid["chi_value"].max())
# smallest merged value (negated value of the sunrise topic)
print(grid["chi_value"].min())
# preview: 10 rows out of a random sample of 50, without the geometry column
grid.drop(['geometry'], axis = 1).sample(50).head(10)
Configure what should be shown on hover
# Mapping of display label -> grid column name; passed to plot_diverging
# as "hover_items". Insertion order is kept by the dict.
hover_items = {
    '"Sunset" Chi Value':'chi_value_sunset',
    '"Sunrise" Chi Value':'chi_value_sunrise',
    '"Sunset" Chi Significant':'significant_sunset',
    '"Sunrise" Chi Significant':'significant_sunrise',
    '"Sunset" User Count':'usercount_est_sunset',
    '"Sunrise" User Count':'usercount_est_sunrise',
    '"Sunset" User Days':'userdays_est_sunset',
    '"Sunrise" User Days':'userdays_est_sunrise',
    '"Sunset" Post Count':'postcount_est_sunset',
    '"Sunrise" Post Count':'postcount_est_sunrise',
    'Expected Post Count':'postcount_est_expected',
    'Expected User Days':'userdays_est_expected',
    'Expected User Count':'usercount_est_expected',
    'Underrepresented':'underrepresented'
    }
# Entries for the merged columns and the metric selected via CHI_COLUMN.
# NOTE(review): if CHI_COLUMN is one of the three *_est metrics, the
# "<CHI_COLUMN>_expected" column is already listed above under another label.
hover_items_chi = {
    f'Total {METRIC_NAME_REF[CHI_COLUMN]}':f'{CHI_COLUMN}_expected',
    'Chi-value':'chi_value',
    'Chi-significant':'significant'
    }
# append the merged-column entries to the per-topic entries
hover_items.update(hover_items_chi)
Set plot options for diverging, merged chi maps
# Keyword arguments forwarded to plot_diverging (defined outside this
# notebook); the inline notes are the original author's descriptions.
kwargs = {
    "hover_items":hover_items,
    # two colormap names; the plot title labels sunset red and sunrise blue
    "cmaps_diverging":("OrRd", "Blues"),
    "true_negative":False, # all + labels
    "notopicdata_color":"#F0F0F0", # grey
    "mask_nonsignificant":True # mapped to notopic class
    }
Plot map
# plot_diverging, METRIC_NAME_REF and km_size are not defined in this
# notebook (presumably provided by the star import of _03_chimaps)
gv_plot = plot_diverging(
    grid, title=(
        f'Chi value merged: '
        f'Flickr {METRIC_NAME_REF[CHI_COLUMN]} for "Sunrise" (blue) and "Sunset" (red) '
        f'(estimated, normalized to 1-1000 range) per {km_size:.0f} km grid, 2007-2018'), **kwargs)
# last expression of the cell: renders the plot in the notebook
gv_plot