Alexander Dunkel, TU Dresden, Institute of Cartography; Maximilian Hartmann and Ross Purves, Universität Zürich (UZH), Geocomputation
Visualization of TFIDF and Cosine Similarity Values
The values loaded here have been generated outside Jupyter, in a separate process. This notebook only visualizes data.
import sys
from pathlib import Path
# Make the "py" folder next to the current working directory's parent
# importable, so the chained notebook modules below can be found.
module_path = str(Path.cwd().parents[0].joinpath("py"))
if module_path not in sys.path:
    # Only append once, so re-running this cell does not duplicate the entry
    sys.path.append(module_path)
# import all previous chained notebooks
from _05_countries import *
Activate autoreload of changed Python files:
%load_ext autoreload
%autoreload 2
Data is stored as aggregate HLL data (postcount) for each term.
# Input folder: "00_topic_data", two levels above the current working directory
root = Path.cwd().parents[1].joinpath("00_topic_data")
# CSV with the TF-IDF term ranking per country
TERMS_FLICKR_TFIDF = root.joinpath(
    "20210202_FLICKR_SUNSET_random_country_tf_idf.csv")
# CSV with the (binary) cosine similarity values between countries
TERMS_FLICKR_COSINE = root.joinpath(
    "20211029_FLICKR_SUNSET_random_country_cosine_similarity_binary.csv")
Some statistics for these files:
%%time
# Map a display label to each input file, then report statistics for them
# (tools.display_file_stats is a helper from the chained notebook modules).
data_files = {
    "TERMS_FLICKR_TFIDF": TERMS_FLICKR_TFIDF,
    "TERMS_FLICKR_COSINE": TERMS_FLICKR_COSINE,
}
tools.display_file_stats(data_files)
Load the data as pandas DataFrames:
def load_cosine_df(csv: Path = TERMS_FLICKR_COSINE) -> pd.DataFrame:
    """Read the per-country cosine similarity matrix from a CSV file.

    Args:
        csv: Path of the CSV file to read. Defaults to TERMS_FLICKR_COSINE.

    Returns:
        A DataFrame indexed by the first CSV column, whose column labels
        are replaced by the index labels, so rows and columns carry the
        same labels.
    """
    similarity = pd.read_csv(
        csv, encoding='utf-8', skiprows=0, index_col=0)
    # The file holds a similarity matrix: reuse the row labels as column
    # labels instead of the labels parsed from the header row.
    similarity.columns = similarity.index
    return similarity
# Load the cosine similarity matrix from the default CSV path and preview it
df_cos = load_cosine_df()
df_cos.head()
def load_tfidf_df(csv: Path = TERMS_FLICKR_TFIDF) -> pd.DataFrame:
    """Read the TF-IDF ranking per country from a CSV file.

    Args:
        csv: Path of the CSV file to read. Defaults to TERMS_FLICKR_TFIDF.

    Returns:
        A DataFrame that uses the first CSV row as column labels and the
        first CSV column as index.
    """
    return pd.read_csv(csv, encoding='utf-8', header=0, index_col=0)
# Load the TF-IDF ranking from the default CSV path and preview it
df_tfidf = load_tfidf_df()
df_tfidf.head()
Combine the top terms into a single column and drop all other columns:
# Column labels TERM_1 ... TERM_19 (the range end is exclusive).
# NOTE(review): confirm the CSV has no TERM_20 column that should be included.
cols = [f'TERM_{ix}' for ix in range(1, 20)]
# Join the term columns of every row into one space-separated string;
# values are cast to str first, so non-string cells do not break the join.
df_tfidf['tfidf'] = df_tfidf[cols].apply(
    lambda terms: ' '.join(terms.values.astype(str)), axis=1)
# Keep only the combined column. drop_cols_except is a helper defined outside
# this section; presumably it modifies the frame in place - verify.
drop_cols_except(df_tfidf, ['tfidf'])
df_tfidf.head()
def load_country_geom(
        ne_path: Path = NE_PATH, ne_uri: str = NE_URI,
        ne_filename: str = NE_FILENAME, crs_proj: str = CRS_PROJ,
        country_col: str = COUNTRY_COL) -> gp.GeoDataFrame:
    """Load country geometries, reproject them and index them by country.

    Args:
        ne_path: Folder that contains the shapefile.
        ne_uri: Accepted for interface compatibility; not used in this
            function body.
        ne_filename: Archive file name; its ".zip" part is replaced with
            ".shp" to get the shapefile name.
        crs_proj: Target coordinate reference system passed to `to_crs`.
        country_col: Column that becomes the index of the result.

    Returns:
        A GeoDataFrame indexed by `country_col`. Only 'geometry',
        `country_col` and 'ADMIN' are passed to drop_cols_except as
        columns to keep (helper defined elsewhere; presumably it drops
        the remaining columns in place - verify).
    """
    shapefile = ne_path / ne_filename.replace(".zip", ".shp")
    world = gp.read_file(shapefile).to_crs(crs_proj)
    drop_cols_except(world, ['geometry', country_col, 'ADMIN'])
    world.set_index(country_col, inplace=True)
    return world
# Load the country geometries with the default settings and preview them
world = load_country_geom()
world.head()
This GeoDataFrame can be visualized interactively with HoloViews:
# Render the country polygons; Mollweide is passed as the CRS of the data.
# NOTE(review): this assumes CRS_PROJ (used in load_country_geom) is Mollweide - confirm.
gv.Polygons(world, crs=crs.Mollweide())