Module tagmaps.__main__
Tag Maps Clustering Package
Package to cluster data (locations, tags or emoji) and output a shapefile containing Alpha Shapes and statistics.
The package can be executed directly (__main__) or imported using: from tagmaps import TagMaps as TM
Expand source code
# -*- coding: utf-8 -*-
"""
Tag Maps Clustering Package
Package to cluster data (locations, tags or emoji)
and output shapefile containing Alpha Shapes
and statistics.
Package can be executed directly (__main__) or
imported using from tagmaps import TagMaps as TM
"""
from __future__ import absolute_import
# delay evaluation of annotations at runtime (PEP 563)
from __future__ import annotations
__author__ = "Alexander Dunkel"
__license__ = "GNU GPLv3"
import sys
import time
from multiprocessing import freeze_support
from tagmaps.tagmaps_ import TagMaps
from tagmaps.classes.load_data import LoadData
from tagmaps.classes.utils import Utils
from tagmaps.config.config import BaseConfig
def main():
"""Main tag maps method for direct execution of package.
The order of execution is pretty linear:
1. LoadData (cleanup, apply basic filters, stoplists etc.) - /01_Input/
2. PrepareData (global statistics, remove long tail, prepare
cleaned data structure)
3. Optional: user input for cluster distance/item selection
4. Cluster Step: calculate itemized / global clusters
5. Alpha Shapes: for each cluster, calculate spatial shape/ boundary
6. Compile Output: normalize results, add statistics, shapefile
7. Write results: Write Shapefile to file - /02_Output/
"""
# NOTE(review): this listing has lost its leading indentation, so the
# nesting of the statements below cannot be read off the text alone.
# Confirm block boundaries against the packaged source before
# changing any logic here.
# NOTE(review): the docstring above lists 7 steps, while the log
# banners below count "of 6".
# init main procedure settings
Utils.init_main()
# initialize config from args
cfg = BaseConfig()
print('\n')
# reuse the logger carried by the config object
log = cfg.log
log.info(
"########## "
"STEP 1 of 6: Data Cleanup "
"##########")
# input reader: used below as a context manager that yields records,
# and for is_intermediate(), filelist and input_stats_report()
input_data = LoadData(
cfg, user_variety_input=True, console_reporting=True)
# initialize tag maps, forwarding the relevant config values
tagmaps = TagMaps(
tag_cluster=cfg.cluster_tags,
emoji_cluster=cfg.cluster_emoji,
location_cluster=cfg.cluster_locations,
output_folder=cfg.output_folder,
remove_long_tail=cfg.remove_long_tail,
limit_bottom_user_count=cfg.limit_bottom_user_count,
topic_modeling=cfg.topic_modeling,
local_saturation_check=cfg.local_saturation_check,
max_items=cfg.max_items,
logging_level=cfg.logging_level,
mapnik_export=cfg.mapnik_export,)
# two ingestion paths: intermediate data (selected by an explicit
# cfg.load_from_intermediate value or by input_data.is_intermediate())
# or raw input records
if cfg.load_from_intermediate or input_data.is_intermediate():
# load data from intermediate (already filtered) results
if not cfg.load_from_intermediate:
# no explicit path configured: take the first entry of
# input_data.filelist
filename = next(iter(input_data.filelist))
# NOTE(review): presumably prevents re-writing data that already
# is the cleaned intermediate output - confirm
cfg.write_cleaned_data = False
else:
filename = cfg.load_from_intermediate
tagmaps.load_intermediate(input_path=filename)
else:
# read and process unfiltered input records from csv
with input_data as records:
for record in records:
tagmaps.add_record(record)
# get statistics for
# unfiltered input data
# NOTE(review): whether this call sits inside the `with` block or
# directly after it is not recoverable from this listing
input_data.input_stats_report()
# prepare loaded data for clustering
tagmaps.prepare_data()
# show statistics for ingested data
tagmaps.global_stats_report()
# start time for the elapsed-time report at the end; it is reset
# below when the cluster step actually runs
now = time.time()
if (cfg.cluster_tags or cfg.cluster_emoji):
log.info(
"\n########## "
"STEP 2 of 6: Tag Ranking "
"##########")
# calculate and report item stats
tagmaps.item_stats_report()
# intermediate results are only written when requested and when the
# data was not itself loaded from an explicit intermediate path
if cfg.write_cleaned_data and not cfg.load_from_intermediate:
# write intermediate results
tagmaps.write_cleaned_data()
# write toplists (emoji, location, tags)
tagmaps.write_toplists()
if cfg.topic_modeling:
tagmaps.write_topics()
# defaults to True so that auto mode (no user interface) proceeds
continue_proc = True
# identity check: only the literal False enables the cluster step
if cfg.statistics_only is False:
# restart time monitoring for
# actual cluster step
now = time.time()
log.info(
"\n########## "
"STEP 3 of 6: Tag & Emoji "
"Location Clustering "
"##########")
# get user input for cluster distances
if not cfg.auto_mode:
# open user interface for optional user input;
# its return value decides whether processing continues
continue_proc = tagmaps.user_interface()
else:
# if auto mode and cluster cut distance
# provided via args, set for all clusterer
if cfg.cluster_cut_distance:
tagmaps.set_cluster_distance(
cfg.cluster_cut_distance
)
# identity check: only the literal True passes
if continue_proc is True:
if cfg.cluster_tags or cfg.cluster_emoji:
# NOTE(review): both cluster calls are issued when either flag is
# set; how many of the following statements belong to this `if`
# is not recoverable from this listing
tagmaps.cluster_tags()
tagmaps.cluster_emoji()
log.info(
"########## "
"STEP 4 of 6: Generating Alpha Shapes "
"##########")
tagmaps.gen_tagcluster_shapes()
tagmaps.gen_emojicluster_shapes()
log.info(
"########## "
"STEP 5 of 6: Writing Results to Shapefile "
"##########")
tagmaps.write_tagemoji_shapes()
else:
# NOTE(review): presumably pairs with `if continue_proc is True`
# above (user declined in the interface) - confirm
print(f'\nUser abort.')
# overall location clustering is skipped after a user abort
if cfg.cluster_locations and continue_proc is True:
log.info(
"\n########## "
"STEP 6 of 6: Calculating Overall Location Clusters "
"##########")
tagmaps.cluster_locations()
tagmaps.gen_location_centroids()
tagmaps.write_location_shapes()
# time reporting: split the elapsed seconds since `now` into
# hours, minutes and remaining (fractional) seconds
later = time.time()
hours, rem = divmod(later-now, 3600)
minutes, seconds = divmod(rem, 60)
log.info(f'\nDone.\n{int(hours):0>2} Hours '
f'{int(minutes):0>2} Minutes and '
f'{seconds:05.2f} Seconds passed.')
# in interactive mode, wait for the user before exiting
# NOTE(review): input() returns only once a line is entered (Enter),
# despite the "any key" wording of the prompt
if not cfg.auto_mode:
input("Press any key to exit...")
sys.exit(0)
# run main() only when this module is executed directly, not on import
if __name__ == "__main__":
# freeze_support() is called first; per the multiprocessing docs it is
# needed for frozen Windows executables and has no effect otherwise
freeze_support()
main()
Functions
def main()
-
Main tag maps method for direct execution of package.
The order of execution is pretty linear: 1. LoadData (cleanup, apply basic filters, stoplists etc.) - /01_Input/ 2. PrepareData (global statistics, remove long tail, prepare cleaned data structure) 3. Optional: user input for cluster distance/item selection 4. Cluster Step: calculate itemized / global clusters 5. Alpha Shapes: for each cluster, calculate spatial shape/ boundary 6. Compile Output: normalize results, add statistics, shapefile 7. Write results: Write Shapefile to file - /02_Output/
Expand source code
# NOTE(review): collapsed copy of main(); line breaks and indentation were lost in this listing, so everything after the first '#' on each line below reads as one comment
def main(): """Main tag maps method for direct execution of package. The order of execution is pretty linear: 1. LoadData (cleanup, apply basic filters, stoplists etc.) - /01_Input/ 2. PrepareData (global statistics, remove long tail, prepare cleaned data structure) 3. Optional: user input for cluster distance/item selection 4. Cluster Step: calculate itemized / global clusters 5. Alpha Shapes: for each cluster, calculate spatial shape/ boundary 6. Compile Output: normalize results, add statistics, shapefile 7. Write results: Write Shapefile to file - /02_Output/ """ # init main procedure settings Utils.init_main() # initialize config from args cfg = BaseConfig() print('\n') # get logger from config log = cfg.log log.info( "########## " "STEP 1 of 6: Data Cleanup " "##########") input_data = LoadData( cfg, user_variety_input=True, console_reporting=True) # initialize tag maps tagmaps = TagMaps( tag_cluster=cfg.cluster_tags, emoji_cluster=cfg.cluster_emoji, location_cluster=cfg.cluster_locations, output_folder=cfg.output_folder, remove_long_tail=cfg.remove_long_tail, limit_bottom_user_count=cfg.limit_bottom_user_count, topic_modeling=cfg.topic_modeling, local_saturation_check=cfg.local_saturation_check, max_items=cfg.max_items, logging_level=cfg.logging_level, mapnik_export=cfg.mapnik_export,) if cfg.load_from_intermediate or input_data.is_intermediate(): # load data from intermediate (already filtered) results if not cfg.load_from_intermediate: # if path empty, get first file filename = next(iter(input_data.filelist)) cfg.write_cleaned_data = False else: filename = cfg.load_from_intermediate tagmaps.load_intermediate(input_path=filename) else: # read and process unfiltered input records from csv with input_data as records: for record in records: tagmaps.add_record(record) # get statistics for # unfiltered input data input_data.input_stats_report() # prepare loaded data for clustering tagmaps.prepare_data() # show statistics for ingested data 
tagmaps.global_stats_report() # get current time for monitoring now = time.time() if (cfg.cluster_tags or cfg.cluster_emoji): log.info( "\n########## " "STEP 2 of 6: Tag Ranking " "##########") # calculate and report item stats tagmaps.item_stats_report() if cfg.write_cleaned_data and not cfg.load_from_intermediate: # write intermediate results tagmaps.write_cleaned_data() # write toplists (emoji, location, tags) tagmaps.write_toplists() if cfg.topic_modeling: tagmaps.write_topics() continue_proc = True if cfg.statistics_only is False: # restart time monitoring for # actual cluster step now = time.time() log.info( "\n########## " "STEP 3 of 6: Tag & Emoji " "Location Clustering " "##########") # get user input for cluster distances if not cfg.auto_mode: # open user interface for optional user input continue_proc = tagmaps.user_interface() else: # if auto mode and cluster cut distance # provided via args, set for all clusterer if cfg.cluster_cut_distance: tagmaps.set_cluster_distance( cfg.cluster_cut_distance ) if continue_proc is True: if cfg.cluster_tags or cfg.cluster_emoji: tagmaps.cluster_tags() tagmaps.cluster_emoji() log.info( "########## " "STEP 4 of 6: Generating Alpha Shapes " "##########") tagmaps.gen_tagcluster_shapes() tagmaps.gen_emojicluster_shapes() log.info( "########## " "STEP 5 of 6: Writing Results to Shapefile " "##########") tagmaps.write_tagemoji_shapes() else: print(f'\nUser abort.') if cfg.cluster_locations and continue_proc is True: log.info( "\n########## " "STEP 6 of 6: Calculating Overall Location Clusters " "##########") tagmaps.cluster_locations() tagmaps.gen_location_centroids() tagmaps.write_location_shapes() # time reporting later = time.time() hours, rem = divmod(later-now, 3600) minutes, seconds = divmod(rem, 60) # difference = int(later - now) log.info(f'\nDone.\n{int(hours):0>2} Hours ' f'{int(minutes):0>2} Minutes and ' f'{seconds:05.2f} Seconds passed.') if not cfg.auto_mode: input("Press any key to exit...") sys.exit(0)