▓▓▓▓▓▓▓ ▓▓▓▓▓▓▓, ▓▓▓▓▓▓▓, ▓▓▓▓▓▓▓ ▓▓▓▓▓▓▓ ▓▓▓▓▓▓▓; ▓▓▓▓▓▓▓ ▓▓▓▓▓▓▓, ▓▓▓▓▓▓▓ ▓▓▓▓▓▓▓ ▓▓▓▓▓▓▓, ▓▓▓▓▓▓▓
Visualizations of temporal patterns for places affected by "mass invasion". The list of 13 places was prepared by Claudia Tautenhahn for her master's thesis.
Tautenhahn, C. Das Phänomen der Masse: Landschaftliche Wirkungen sozialer Medien. Master's thesis. TU Dresden 2020, Department of Landscape Architecture and Environmental Planning
I was the second advisor of this thesis and performed the data collection. Based on Claudia's list, 1.5 million Instagram posts were queried. The query period covered 2010 to 2019, but the lower analysis bound was limited to 2015 due to low data availability in early years. This notebook uses the originally collected data to create visualizations based on privacy-aware HyperLogLog cardinality estimation.
# Output and working directories live next to the notebook folder, i.e. in
# the parent of the current working directory. Create them if missing.
_PARENT_DIR = Path.cwd().parents[0]
OUTPUT = _PARENT_DIR / "out"  # output directory for figures (etc.)
WORK_DIR = _PARENT_DIR / "tmp"  # Working directory
for _folder in (OUTPUT, OUTPUT / "figures", OUTPUT / "svg", WORK_DIR):
    _folder.mkdir(exist_ok=True)
%load_ext autoreload
%autoreload 2
Select `M` for monthly aggregation, `Y` for yearly aggregation.
# Temporal aggregation base passed to the processing and plotting functions:
# "M" aggregates per month, "Y" per year.
AGG_BASE = "M"
First, define whether to study usercount or postcount
# Switch the metric under study by (un)commenting the override below.
# METRIC = 'user'
METRIC = 'post'
# Name of the HLL column that belongs to the selected metric
metric_col = 'user_hll' if METRIC == 'user' else 'post_hll'
Set global font
# Use a serif typeface everywhere and put Times New Roman in front of
# Matplotlib's already configured serif fonts (kept as fallbacks).
plt.rcParams['font.family'] = 'serif'
_serif_fallbacks = plt.rcParams['font.serif']
plt.rcParams['font.serif'] = ['Times New Roman', *_serif_fallbacks]
# Fill colour used by the bar plots further below
color_instagram = '#737EBD'
Load the data from CSV, generated in the previous notebook. Data is stored as aggregate HLL data (postcount, usercount) for each month.
%%time
# Map a readable label to each input file and print basic file statistics
data_files = dict(MASS_INVASION_ALL=MASS_INVASION_ALL)
tools.display_file_stats(data_files)
# Preview the first ten rows of the CSV (shown as the cell output)
pd.read_csv(MASS_INVASION_ALL, nrows=10)
# Connection parameters for the Postgres database used for HLL calculations
db_user = "postgres"
# The password is taken from the environment; os.getenv() yields None if
# POSTGRES_PASSWORD is not set.
db_pass = os.getenv('POSTGRES_PASSWORD')
db_host = "127.0.0.1"
db_port = "25432"
db_name = "hlldb"
Connect to empty Postgres database running HLL Extension:
# Open the database connection from the db_* parameters
DB_CONN = psycopg2.connect(
    user=db_user, password=db_pass,
    host=db_host, port=db_port, dbname=db_name)
# Put the session into read-only mode
DB_CONN.set_session(readonly=True)
# Wrapper from the tools module plus a plain cursor; the cursor is what the
# cardinality helper below hands to hll.cardinality_hll().
DB_CALC = tools.DbConn(DB_CONN)
CUR_HLL = DB_CONN.cursor()
test
Define additional functions for reading and formatting CSV as pd.DataFrame
from datetime import datetime
def read_csv_datetime(csv: Path) -> pd.DataFrame:
    """Read a CSV and index it by the month given in its year/month columns.

    The CSV must contain the numeric columns ``year`` and ``month``. They are
    combined into a ``DatetimeIndex`` named ``datetime`` (first day of the
    respective month) and dropped from the result; every other column is
    returned unchanged.
    """
    date_cols = ["year", "month"]
    df = pd.read_csv(csv)
    # Assemble the timestamps from their numeric components rather than via
    # read_csv's nested ``parse_dates``/``keep_date_col`` options, which are
    # deprecated in recent pandas releases. The earlier code also passed the
    # *string* 'False' for keep_date_col, which is truthy and only worked by
    # accident.
    month_start = pd.to_datetime(df[date_cols].assign(day=1))
    df.index = pd.DatetimeIndex(month_start, name="datetime")
    return df.drop(columns=date_cols)
def append_cardinality_df(df: pd.DataFrame, hll_col: str = "post_hll", cardinality_col: str = 'postcount_est'):
    """Replace the HLL column of df by a column of cardinality estimates.

    Each row's value in `hll_col` is passed, together with the module-level
    cursor CUR_HLL, to hll.cardinality_hll(); the results are stored in
    `cardinality_col`. Afterwards `hll_col` is removed. The frame is
    modified in place and also returned.
    """
    def estimate(row):
        # one cardinality lookup per row
        return hll.cardinality_hll(row[hll_col], CUR_HLL)

    df[cardinality_col] = df.apply(estimate, axis=1)
    df.drop(columns=[hll_col], inplace=True)
    return df
def filter_fill_time(
        df: pd.DataFrame, min_year: int,
        max_year: int, val_col: str = "postcount_est",
        min_month: str = "01", max_month: str = "01", agg_base: str = None,
        agg_method = None):
    """Clip a time series to a date range, resample it and fill the gaps.

    Args:
        df: Frame with a datetime index and the column `val_col`. It is not
            modified.
        min_year, min_month: Start of the range (first day of that month).
        max_year, max_month: End of the range (first day of that month). For
            yearly aggregation the end is always 31 December of `max_year`.
        val_col: Column holding the values to aggregate.
        agg_base: Resampling base, "M" (month, default) or "Y" (year); other
            values are handed to ``resample`` unchanged.
        agg_method: None (sum, default), "count" or "nunique".

    Returns:
        A single-column frame (`val_col`) with one row per period between
        the start and end date; periods without data contain 0.

    Raises:
        ValueError: If `agg_method` is not one of the supported values.
    """
    if agg_method not in (None, "count", "nunique"):
        raise ValueError(
            f"Unsupported agg_method {agg_method!r}; "
            "expected None, 'count' or 'nunique'")
    max_day = "01"
    if agg_base is None:
        agg_base = "M"
    elif agg_base == "Y":
        max_month = "12"
        max_day = "31"
    min_date = pd.Timestamp(f'{min_year}-{min_month}-01')
    max_date = pd.Timestamp(f'{max_year}-{max_month}-{max_day}')
    # Work on a private copy so the boundary rows added below never leak
    # into the caller's frame.
    values = df[[val_col]].copy()
    # Make sure both range boundaries exist, so that resampling spans the
    # full range even if the data starts later or ends earlier.
    for boundary in (min_date, max_date):
        if boundary not in values.index:
            values.loc[boundary, val_col] = 0
    values.sort_index(inplace=True)
    # mask min and max time
    time_mask = (values.index >= min_date) & (values.index <= max_date)
    # Offset objects are equivalent to the "M"/"Y" aliases but do not depend
    # on the alias spelling accepted by the installed pandas version.
    period_end = {"M": pd.offsets.MonthEnd(), "Y": pd.offsets.YearEnd()}
    resampled = values.loc[time_mask, val_col].resample(
        period_end.get(agg_base, agg_base))
    if agg_method is None:
        series = resampled.sum()
    elif agg_method == "count":
        series = resampled.count()
    else:
        series = resampled.nunique()
    # fill missing periods with 0; resampling labels each period by its
    # last day
    return series.fillna(0).to_frame()
Select dataset to process below
Apply functions to all data sets.
def process_dataset(
        dataset: Path = None, metric: str = None, df_post: pd.DataFrame = None,
        min_year: int = None, max_year: int = None, agg_base: str = None) -> pd.DataFrame:
    """Apply temporal filter/pre-processing to a data set.

    Args:
        dataset: CSV to load with read_csv_datetime(); only read when
            `df_post` is not given.
        metric: HLL column to evaluate. Defaults to 'post_hll' (a warning is
            emitted); any other value is reported as 'usercount_est'.
        df_post: Already loaded frame to use instead of `dataset`. It is
            left untouched.
        min_year: First year to keep (default 2015).
        max_year: Last year to keep (default 2019).
        agg_base: Aggregation base forwarded to filter_fill_time().

    Returns:
        The frame produced by filter_fill_time() for the estimated counts.

    Raises:
        ValueError: If neither `dataset` nor `df_post` is provided.
    """
    if metric is None:
        metric = 'post_hll'
        warn(f"Using default value {metric}")
    cardinality_col = (
        'postcount_est' if metric == 'post_hll' else 'usercount_est')
    if min_year is None:
        min_year = 2015
    if max_year is None:
        max_year = 2019
    if df_post is None:
        if dataset is None:
            raise ValueError("Either dataset or df_post must be provided")
        df_post = read_csv_datetime(dataset)
    else:
        # append_cardinality_df() drops the HLL column in place; copy first
        # so the caller's frame can be processed again later.
        df_post = df_post.copy()
    df_post = append_cardinality_df(df_post, metric, cardinality_col)
    return filter_fill_time(df_post, min_year, max_year, cardinality_col, agg_base=agg_base)
%%time
# Estimated post counts for the whole data set, aggregated per AGG_BASE
df_post = process_dataset(MASS_INVASION_ALL, agg_base=AGG_BASE, metric='post_hll')
df_post.head(5)
%%time
# NOTE(review): despite the name `df_user`, the metric comes from
# `metric_col`, which is 'post_hll' as long as METRIC == 'post' - confirm
# whether 'user_hll' was intended here.
df_user = process_dataset(MASS_INVASION_ALL, metric=metric_col, agg_base=AGG_BASE)
df_user.head(5)
Define plot function.
def fill_plot_time(
        df: pd.DataFrame, ax: Axes, color: str,
        label: str, val_col: str = "postcount_est") -> Axes:
    """Draw `val_col` as a line with the area underneath filled.

    The index of df is converted to year strings before plotting, so the
    x positions are the row numbers 0..len(df)-1.

    Args:
        df: Frame with a datetime index and the column `val_col`.
        ax: Axes to draw on.
        color: Line/fill colour. If None, a colour taken from seaborn's
            "vlag" palette is used.
        label: Legend label of the line.
        val_col: Column to plot.

    Returns:
        The Axes returned by the pandas plot call.
    """
    if color is None:
        palette = sns.color_palette("vlag", as_cmap=True, n_colors=2)
        color = mcolor.rgb2hex(palette([1.0])[0], keep_alpha=True)
    line_color = color
    if "significant" in df.columns:
        # rows flagged as not significant get a white line colour
        line_color = df['significant'].replace({True: color, False: "white"})
    df_plot = df.set_index(
        df.index.map(lambda s: s.strftime('%Y')))
    ax = df_plot.plot(
        ax=ax, y=val_col, color=line_color,
        label=label, linewidth=0.5, alpha=0.6)
    ax.fill_between(
        range(len(df_plot.index)), df_plot[val_col],
        facecolor=color, alpha=0.6)
    return ax
def plot_time(
        df: Tuple[pd.DataFrame, pd.DataFrame], title, color = None, filename = None,
        output = OUTPUT, legend: str = "Postcount", val_col: str = None,
        trend: bool = None, seasonal: bool = None, residual: bool = None,
        agg_base: str = None, fig = None, ax = None, return_fig_ax = None):
    """Plot a time series as a filled line and optionally store the figure.

    Args:
        df: Frame handed to fill_plot_time().
        title: Axes title.
        color: Colour forwarded to fill_plot_time().
        filename: If given, the figure is written to
            ``output/figures/<filename>.png`` and ``output/svg/<filename>.svg``.
        output: Base directory for stored figures.
        legend: Series label; also used as y-axis label and, lower-cased
            with the suffix ``_est``, as default for `val_col`.
        val_col: Column to plot.
        trend, seasonal, residual: Accepted but not used by this function.
        agg_base: "Y" selects the compact yearly layout, anything else the
            monthly one.
        fig, ax: Existing figure/axes to draw on. A new figure is created
            only when `ax` is missing; if just `ax` is given, its own
            figure is used.
        return_fig_ax: If truthy, return ``(fig, ax)``; otherwise None.
    """
    # layout for monthly data; overridden below for yearly aggregation
    x_ticks_every = 12
    fig_x = 10
    fig_y = 2
    x_label = "Year"
    if agg_base == "Y":
        x_ticks_every = 1
        fig_x = 3
        fig_y = 1.5
    if ax is None:
        fig, ax = plt.subplots()
        fig.set_size_inches(fig_x, fig_y)
    elif fig is None:
        # Previously a supplied ax was silently replaced by a new figure
        # whenever fig was missing; draw on the given axes instead.
        fig = ax.figure
    ylabel = f'{legend}'
    if val_col is None:
        val_col = f'{legend.lower()}_est'
    ax = fill_plot_time(
        df=df, ax=ax, color=color, val_col=val_col, label=legend)
    # TODO: below is a bit hacky way to format the x-axis;
    tick_loc = mticker.MultipleLocator(x_ticks_every)
    ax.xaxis.set_major_locator(tick_loc)
    ax.tick_params(axis='x', rotation=45, length=0.5)
    ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
    # hard-coded year labels for the ticks
    xrange_min_max = range(2015, 2021)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ax.set_xticklabels(xrange_min_max, rotation=45)
    ax.set(xlabel=x_label, ylabel=ylabel)
    ax.spines["left"].set_linewidth(0.25)
    ax.spines["bottom"].set_linewidth(0.25)
    ax.spines["top"].set_linewidth(0)
    ax.spines["right"].set_linewidth(0)
    ax.yaxis.set_tick_params(width=0.5)
    # remove legend (if one was created)
    legend_box = ax.get_legend()
    if legend_box is not None:
        legend_box.remove()
    ax.set_title(title)
    ax.set_xlim(-0.5, len(df)-0.5)
    ax.set_ylim(bottom=0)
    # use a small font for all labels
    for item in (
            [ax.xaxis.label, ax.title, ax.yaxis.label] +
            ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(8)
    # store figure to file
    if filename:
        fig.savefig(
            output / "figures" / f"{filename}.png", dpi=300, format='PNG',
            bbox_inches='tight', pad_inches=1, facecolor="white")
        # also save as svg
        fig.savefig(
            output / "svg" / f"{filename}.svg", format='svg',
            bbox_inches='tight', pad_inches=1, facecolor="white")
    if return_fig_ax:
        return fig, ax
def load_and_plot(
        dataset: Path = None, metric: str = None, src_ref: str = "flickr", colors: cm.colors.ListedColormap = None,
        agg_base: str = None, trend: bool = None, return_df: bool = None, df_post: pd.DataFrame = None, return_fig_ax = None):
    """Process a data set, plot it over time and show the figure.

    The data is prepared with process_dataset() and drawn with plot_time(),
    which also stores the figure as
    ``temporal_<metric label>_<src_ref>_absolute``. `src_ref` doubles as the
    plot title.

    Returns ``(fig, ax)`` if `return_fig_ax` is truthy, else the processed
    frame if `return_df` is truthy, else None.
    """
    metric = 'post_hll' if metric is None else metric
    metric_label = 'postcount' if metric == 'post_hll' else 'usercount'
    if colors is None:
        # default: colour picked from seaborn's "vlag" palette
        palette = sns.color_palette("vlag", as_cmap=True, n_colors=2)
        colors = palette([1.0])
    df = process_dataset(
        dataset, metric=metric, agg_base=agg_base, df_post=df_post)
    fig, ax = plot_time(
        df,
        legend=metric_label.capitalize(),
        color=colors,
        title=f'{src_ref}',
        filename=f"temporal_{metric_label}_{src_ref}_absolute",
        trend=trend,
        agg_base=agg_base,
        return_fig_ax=True)
    fig.show()
    if return_fig_ax:
        return fig, ax
    return df if return_df else None
# Palette object kept at notebook level (it is not passed to the call below)
colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
# Plot the totals over all places for the selected metric
fig, ax = load_and_plot(
    MASS_INVASION_ALL,
    src_ref=f"Mass invasion total posts 2015-2019, all 13 places",
    agg_base=AGG_BASE,
    trend=False,
    metric=metric_col,
    return_fig_ax=True)
fig.show()
# Load the raw CSV (without datetime parsing) and inspect its last rows
df = pd.read_csv(MASS_INVASION_ALL)
df.tail()
# Distinct values of the 'topic_group' column, i.e. the places to plot
mass_invasion_places = df['topic_group'].unique()
mass_invasion_places
def plot_bars(
        df: pd.DataFrame, ax: matplotlib.axes = None, title: str = None):
    """Bar plot of the mean `postcount_est` per calendar month.

    Rows of df are grouped by the month of their datetime index and the
    group means are drawn as bars labelled Jan..Dec. A new figure is created
    when no axes is passed. The title (default "Post Count per Month
    (mean)") is placed below the plot.
    """
    # create figure
    if not ax:
        fig, ax = plt.subplots(1, 1, figsize=(3, 1.5))
    # plot
    monthly_mean = df.groupby(df.index.month)["postcount_est"].mean()
    monthly_mean.plot.bar(
        ax=ax, color=color_instagram, y="postcount_est",
        width=1.0,
        label="Mass invasion total post count aggregated for months",
        edgecolor="white",
        linewidth=0.5,
        alpha=1.0)
    # format
    ax.set_xlim(-0.5, 11.5)
    ax.set_xticklabels([
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
    ax.tick_params(axis='x', rotation=45, length=0)  # length: of ticks
    # thin left/bottom spines, hidden top/right spines
    for side, width in (
            ("left", 0.25), ("bottom", 0.25), ("top", 0), ("right", 0)):
        ax.spines[side].set_linewidth(width)
    ax.yaxis.set_tick_params(width=0.5)
    ax.set(xlabel="", ylabel="")
    ax.set_title(title or "Post Count per Month (mean)", y=-0.2, pad=-14)
    text_items = [
        ax.xaxis.label, ax.title, ax.yaxis.label,
        *ax.get_xticklabels(), *ax.get_yticklabels()]
    for item in text_items:
        item.set_fontsize(8)
# Number of places; used below to size the subplot grid
top_50_cnt = 13
# Sorted list of the place names
top_50 = sorted(mass_invasion_places.tolist())
top_50
df_post = read_csv_datetime(MASS_INVASION_ALL)
# create figure object with multiple subplots
fig, axes = plt.subplots(
    nrows=int(top_50_cnt/3), ncols=4, figsize=(14, 11))
# adjust vertical space, to allow title below plot
fig.subplots_adjust(hspace=.5)
# iterate places; axes without a matching place are switched off
for ix, ax in enumerate(axes.reshape(-1)):
    if ix < len(top_50):
        np_str = top_50[ix]
        # filter np_str and calculate cardinality
        df_post_filter = df_post[df_post["topic_group"] == np_str].copy()
        df_post_filter = append_cardinality_df(
            df_post_filter, 'post_hll', 'postcount_est')
        # NOTE(review): the range 2010-2023 is zero-filled for months
        # without data, which enter the monthly means - confirm intended.
        df_post_filter = filter_fill_time(
            df_post_filter, 2010, 2023, 'postcount_est')
        # plot bars individually
        plot_bars(df=df_post_filter, title=np_str, ax=ax)
    else:
        ax.set_axis_off()
tools.save_fig(fig, output=OUTPUT, name="barplot_massinvasion_seasonal")
def bar_plot_time(
        df: pd.DataFrame, ax: Axes, color: str,
        label: str, val_col: str = "postcount_est") -> Axes:
    """Matplotlib bar plot of `val_col` with year strings as x labels.

    Args:
        df: Frame with a datetime index and the column `val_col`.
        ax: Axes to draw on.
        color: Fill colour of the bars; None selects `color_instagram`.
        label: Legend label of the bars.
        val_col: Column to plot.

    Returns:
        The Axes returned by the pandas plot call.
    """
    if color is None:
        color = color_instagram
    # NOTE(review): earlier versions computed per-row fill/edge colours from
    # a "significant" column but never passed them to the plot call; that
    # dead code was removed. Re-add it deliberately if per-bar colouring is
    # wanted.
    df_plot = df.set_index(
        df.index.map(lambda s: s.strftime('%Y')))
    # honour the color and val_col arguments (both were ignored before in
    # favour of hard-coded values)
    return df_plot.plot.bar(
        ax=ax, color=color, y=val_col,
        width=1.0, label=label, edgecolor="white",
        linewidth=0.5, alpha=1.0)
def plot_time(
        df: Tuple[pd.DataFrame, pd.DataFrame], title, color = None, filename = None,
        output = OUTPUT, legend: str = "Postcount", val_col: str = None,
        trend: bool = None, seasonal: bool = None, residual: bool = None,
        agg_base: str = None, ax = None):
    """Draw a bar time plot of df on `ax` (bar-chart variant).

    Args:
        df: Frame handed to bar_plot_time().
        title: Axes title, placed below the plot.
        color: Colour forwarded to bar_plot_time().
        filename, output: Accepted but not used; nothing is written to disk.
        legend: Series label; lower-cased with the suffix ``_est`` it is the
            default for `val_col`.
        val_col: Column to plot.
        trend, seasonal, residual: Accepted but not used by this function.
        agg_base: "Y" selects yearly ticks and the compact figure size.
        ax: Axes to draw on; a new figure is created if None.

    Returns:
        None.
    """
    # layout for monthly data; overridden below for yearly aggregation
    x_ticks_every = 12
    fig_x, fig_y = 15.7, 4.27
    if agg_base == "Y":
        x_ticks_every = 1
        fig_x, fig_y = 3, 1.5
    if ax is None:
        fig, ax = plt.subplots()
        fig.set_size_inches(fig_x, fig_y)
    if val_col is None:
        val_col = f'{legend.lower()}_est'
    ax = bar_plot_time(
        df=df, ax=ax, color=color, val_col=val_col, label=legend)
    # format
    tick_loc = mticker.MultipleLocator(x_ticks_every)
    ax.xaxis.set_major_locator(tick_loc)
    ax.tick_params(axis='x', rotation=45, length=0)
    ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
    ax.set(xlabel="", ylabel="")
    ax.spines["left"].set_linewidth(0.25)
    ax.spines["bottom"].set_linewidth(0.25)
    ax.spines["top"].set_linewidth(0)
    ax.spines["right"].set_linewidth(0)
    ax.yaxis.set_tick_params(width=0.5)
    # remove legend (if one was created)
    legend_box = ax.get_legend()
    if legend_box is not None:
        legend_box.remove()
    ax.set_title(title, y=-0.2, pad=-14)
    ax.set_xlim(-0.5, len(df)-0.5)
    # use a small font for all labels
    for item in (
            [ax.xaxis.label, ax.title, ax.yaxis.label] +
            ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(8)
df_post = read_csv_datetime(MASS_INVASION_ALL)
# create figure object with multiple subplots
fig, axes = plt.subplots(
    nrows=int(top_50_cnt/3), ncols=4, figsize=(17, 8))
# adjust vertical space, to allow title below plot
fig.subplots_adjust(hspace=.5)
# iterate places; axes without a matching place are switched off
for ix, ax in enumerate(axes.reshape(-1)):
    if ix < len(top_50):
        np_str = top_50[ix]
        # filter np_str and calculate cardinality
        df_post_filter = df_post[df_post["topic_group"] == np_str].copy()
        df_post_filter = append_cardinality_df(
            df_post_filter, 'post_hll', 'postcount_est')
        # keep January 2015 up to and including August 2019
        df_post_filter = filter_fill_time(
            df_post_filter, 2015, 2019, 'postcount_est', max_month=8)
        # plot bars individually
        plot_time(df=df_post_filter, title=np_str, ax=ax)
    else:
        ax.set_axis_off()
tools.save_fig(fig, output=OUTPUT, name="barplot_massinvasion_trend")
!jupyter nbconvert --to html_toc \
--output-dir=../resources/html/ ./01_mass_invasion.ipynb \
--output 04_mass_invasion \
--template=../nbconvert.tpl \
--ExtractOutputPreprocessor.enabled=False >&- 2>&-