import altair as alt
import os
import pandas as pd
from typing import Any, Optional, Union
from variationist import utils
from variationist.visualization import chart_utils
from variationist.visualization.diversity_bar_chart import DiversityBarChart
from variationist.visualization.text_only_bar_chart import TextOnlyBarChart
from variationist.visualization.stats_bar_chart import StatsBarChart
class VisualizerArgs:
    """A class storing the arguments for the visualization component.

    Parameters
    ----------
    output_folder: Optional[str] = None
        A path to the output folder in which to store the charts and associated
        metadata. If the folder does not exist, it will be automatically created.
        If no path is provided, the charts will not be serialized and the
        output_formats will be ignored (in this case, the chart objects will only
        be accessible from the dictionary returned by the "create()" function and
        be shown by using the "show()" function).
    output_formats: Optional[list[str]] = None
        A list of output formats for the charts. When left to None (the default),
        only the interactive HTML chart is saved, i.e., ["html"]. Extra choices:
        ["pdf", "svg", "png"].
    zoomable: Optional[bool] = True
        Whether the (HTML) chart should be zoomable using the mouse or not.
    top_per_class_ngrams: Optional[int] = 20
        The maximum number of highest scoring per-class n-grams to show (for bar
        charts only). If set to None, it will show all the n-grams in the corpus
        (it may easily be overwhelming). By default is 20 to keep the visualization
        compact. This parameter is ignored when creating other chart types.
    ngrams: Optional[list[str]] = None
        A list of n-grams of interest to focus the resulting visualizations on.
        N-grams should match the number of tokens used in the prior computation
        reflected by the "results" variable (e.g., if unigrams were chosen, this
        list should only contain unigrams).
    shapefile_path: Optional[str] = None
        A path to the .shp shapefile to be visualized as background map to the chart
        (needed only when including a variable type "nominal" with "spatial"
        semantics). Note that auxiliary files to the .shp one (i.e., .dbf, .prg,
        .shx ones) are required for chart creation too, but do not need to be
        specified. They should have the same name as the .shp file but different
        extension, and be located in the same folder as the .shp file itself. An
        example of repository where to find shapefiles is
        https://geodata.lib.berkeley.edu/, but there exist many other ones and
        shapefiles provided by national/regional institutions.
    shapefile_var_name: Optional[str] = None
        The key field name in the shapefile which contains the names for the areas
        which should match the possible values for the variable of interest (e.g.,
        if the variable of interest is "state", here should go the name of the
        variable name encoded in the shapefile containing the possible states).
    """

    def __init__(
        self,
        output_folder: Optional[str] = None,
        output_formats: Optional[list[str]] = None,
        zoomable: Optional[bool] = True,
        top_per_class_ngrams: Optional[int] = 20,
        ngrams: Optional[list[str]] = None,
        shapefile_path: Optional[str] = None,
        shapefile_var_name: Optional[str] = None,
    ) -> None:
        """
        A function that initializes the arguments useful for visualizing charts.
        """
        self.output_folder = output_folder
        # A list literal used as a default value is created once and shared by
        # every call; build a fresh ["html"] per instance so that mutating one
        # instance's formats never leaks into another instance.
        self.output_formats = ["html"] if output_formats is None else output_formats
        self.zoomable = zoomable
        self.top_per_class_ngrams = top_per_class_ngrams
        self.ngrams = ngrams
        self.shapefile_path = shapefile_path
        self.shapefile_var_name = shapefile_var_name
[docs]class Visualizer:
"""A class for the visualization component. It orchestrates the creation of charts
based on the results and metadata from a prior analysis using Variationist.
Parameters
----------
input_json: `str` or `dict`
A path to the json file or a json/dict object storing metadata and results
from a prior analysis using Variationist.
args: VisualizerArgs
A VisualizerArgs object containing the arguments for the Visualizer"""
def __init__(
    self,
    input_json: Union[str, dict],
    args: VisualizerArgs,
) -> None:
    """
    A function that initializes the arguments of the visualizer, the metadata, and
    the per-metric long-form dataframes that will be used for visualization.

    Parameters
    ----------
    input_json: `str` or `dict`
        A path to the json file or a json/dict object storing metadata and results
        from a prior analysis using Variationist. It is read through
        utils.load_json_data_from_filepath_or_dict and must expose the "metadata"
        and "metrics" top-level keys.
    args: VisualizerArgs
        A VisualizerArgs object containing the arguments for the Visualizer.
    """
    # Function-scope import so this method does not rely on the file-level imports
    import copy

    # Store the visualizer arguments
    self.args = args
    self.df_metric_data = dict()
    self.variable_values = dict()

    # Load the json object storing metadata and results
    json_data = utils.load_json_data_from_filepath_or_dict(input_json)

    # Get the metadata and variable names from the json. The metadata lists are
    # edited in place below, so work on a private copy: otherwise a dict supplied
    # by the caller would be altered (presumably the loader can hand back the very
    # same object), and building a second Visualizer from it would prepend
    # "text_name" twice.
    self.metadata = copy.deepcopy(json_data["metadata"])
    self.variable_names = self.metadata["var_names"]

    # Handle case in which 2 text columns are provided (@TODO: Thoroughly test)
    if len(self.metadata["text_names"]) == 2:
        if len(self.variable_names) >= 1:
            # The text column acts as an extra leading nominal variable
            self.variable_names.insert(0, "text_name")
            self.metadata["var_types"].insert(0, "nominal")
            self.metadata["var_semantics"].insert(0, "general")
            self.metadata["var_bins"].insert(0, 0)
        else:
            self.variable_names = ["text_name::"]

    # The concatenated string useful for multiple variables is the same for all
    # metrics, so compute it once
    var_names_concat = utils.MULTI_VAR_SEP.join(self.variable_names)

    # Get per-metric long-form dataframes from the json
    for metric in self.metadata["metrics"]:
        metric_results = json_data["metrics"][metric]
        if metric == "stats":
            # Retrieve the possible values for the variable (combination) and the given metric
            self.variable_values[metric] = list(
                metric_results["num_texts"][var_names_concat])
            # Get the long-form dataframe
            self.df_metric_data[metric] = self.get_stats_df_from_json(
                json_data=metric_results,
                var_names_concat=var_names_concat)
        else:
            # Retrieve the possible values for the variable (combination) and the given metric
            self.variable_values[metric] = list(metric_results[var_names_concat])
            # Get the long-form dataframe
            self.df_metric_data[metric] = self.get_df_from_json(
                json_data=metric_results,
                var_names_concat=var_names_concat,
                top_per_class_ngrams=self.args.top_per_class_ngrams,
                focus_ngrams=self.args.ngrams)
def get_df_from_json(
    self,
    json_data: dict[str, Any],
    var_names_concat: str,
    top_per_class_ngrams: int,
    focus_ngrams: Optional[list[str]] = None,
) -> pd.core.frame.DataFrame:
    """
    A function that returns a long-form dataframe from a json which
    stores the information about a prior analysis using Variationist.
    Optionally, it takes a list of n-grams to focus the filtering on.

    Parameters
    ----------
    json_data: dict[str, Any]
        The json object storing the results from a prior analysis in the form:
        {varA: {ngram1: value1, ngram2: value2, ...}, varB: {...}, ...}. Note
        that varA, varB, etc. could also take the form of "::"-concatenated
        variable names if multiple variables are present in the analysis.
    var_names_concat: str
        A string denoting the ordered concatenation of variable names (i.e.,
        original column names), separated by utils.MULTI_VAR_SEP, to be used for
        giving meaningful names to the long-form dataframe.
    top_per_class_ngrams: int
        The maximum number of highest scoring per-class n-grams to show (for bar
        charts only). It is accepted for interface compatibility but is not used
        by this method: no top-k truncation happens here.
    focus_ngrams: list[str], *optional*, defaults to `None`
        A list of n-grams of interest to focus the filtering on. N-grams should
        match the number of tokens used in the prior computation (e.g., if
        unigrams were chosen, this list should only contain unigrams). When
        `None`, all the n-grams are kept; an empty list keeps none.

    Returns
    -------
    df_data: pd.core.frame.DataFrame
        A long-form dataframe storing the results of a prior analysis, with one
        column per variable name followed by the "ngram" and "value" columns.
    """
    # Get the individual variables and initialize a column for each of them
    var_names = var_names_concat.split(utils.MULTI_VAR_SEP)
    dict_data = {var_name: [] for var_name in var_names}
    ngrams, values = [], []

    # Build the lookup set once: testing membership on a list for every single
    # n-gram would cost a full scan of the list each time
    focus_set = None if focus_ngrams is None else set(focus_ngrams)

    # Iterate through variable values and ngram-value pairs and keep those of interest
    for variable, raw_items in json_data[var_names_concat].items():
        # The variable label is the same for all of its n-grams: split it once
        var_parts = str(variable).split(utils.MULTI_VAR_SEP)
        for ngram, value in raw_items.items():
            if (focus_set is not None) and (ngram not in focus_set):
                continue
            for i, var_name in enumerate(var_names):
                dict_data[var_name].append(var_parts[i])
            ngrams.append(ngram)
            values.append(value)

    # Create the long-form dataframe
    dict_data["ngram"] = ngrams
    dict_data["value"] = values
    df_data = pd.DataFrame(dict_data)
    return df_data
def get_stats_df_from_json(
    self,
    json_data: dict[str, Any],
    var_names_concat: str,
) -> pd.core.frame.DataFrame:
    """
    A function that returns a long-form dataframe from a json which
    stores the basic statistics from a prior analysis using Variationist.
    This is a variant of get_df_from_json() to handle basic stats; no n-gram
    filtering is involved.

    Parameters
    ----------
    json_data: dict[str, Any]
        The json object storing the results from a prior analysis in the form:
        {substatA: {colnameA: {varA: value1, ...}, ...}, substatB: {colnameA:
        {varA: {"mean": value, "stdev": value}, ...}, ...}, ...}. Note
        that varA, varB, etc. could also take the form of "::"-concatenated
        variable names if multiple variables are present in the analysis.
    var_names_concat: str
        A string denoting the ordered concatenation of variable names (i.e.,
        original column names), separated by utils.MULTI_VAR_SEP, to be used for
        giving meaningful names to the long-form dataframe.

    Returns
    -------
    df_data: pd.core.frame.DataFrame
        A long-form dataframe storing the results of a prior analysis, with one
        column per variable name followed by the "statistics", "val_1" and
        "val_2" columns. For dict-valued entries, "val_1" holds the "mean" and
        "val_2" the "stdev"; for scalar entries, "val_1" holds the value itself
        and "val_2" is None.
    """
    # Get the individual variables and initialize a column for each of them
    var_names = var_names_concat.split(utils.MULTI_VAR_SEP)
    dict_data = {var_name: [] for var_name in var_names}
    submetric_list, val_1_list, val_2_list = [], [], []

    # Iterate through the dictionary to create lists for creating the dataframe
    for submetric, vars_label_vals in json_data.items():
        # The column-name keys are not needed: only the label -> value mappings are
        for label_vals in vars_label_vals.values():
            for label, vals in label_vals.items():
                # isinstance also recognizes dict subclasses (e.g., OrderedDict),
                # which an exact type comparison would treat as scalar values
                if isinstance(vals, dict):
                    val_1, val_2 = vals["mean"], vals["stdev"]
                else:
                    val_1, val_2 = vals, None
                label_parts = str(label).split(utils.MULTI_VAR_SEP)
                for i, var_name in enumerate(var_names):
                    dict_data[var_name].append(label_parts[i])
                submetric_list.append(submetric)
                val_1_list.append(val_1)
                val_2_list.append(val_2)

    # Create the long-form dataframe
    dict_data["statistics"] = submetric_list
    dict_data["val_1"] = val_1_list
    dict_data["val_2"] = val_2_list
    df_data = pd.DataFrame(dict_data)
    return df_data
def create(
    self,
) -> dict[str, dict[str, alt.Chart]]:
    """
    A function that orchestrates the creation of charts based on the results
    and metadata from a prior analysis using Variationist, returning a dictionary
    of metrics (keys) and, for each metric, a dictionary of chart names (keys)
    and chart objects (values).

    Returns
    -------
    charts: dict[str, dict[str, alt.Chart]]
        A dictionary containing the metrics as keys and, as values, a dictionary
        mapping each chart name (e.g., "BarChart", or the name of the chart
        class) to the "base_chart" attribute of the created chart.
    """
    # A dictionary holding the chart objects to be returned to the user
    # This is especially useful when the user would show the chart in a notebook
    charts = {}

    # Create a dictionary of chart-specific arguments
    extra_args = {}
    if self.args.shapefile_path is not None:
        extra_args["shapefile_path"] = self.args.shapefile_path
    if self.args.shapefile_var_name is not None:
        extra_args["shapefile_var_name"] = self.args.shapefile_var_name

    def _build_chart(chart_class, df_data, metric, chart_info, chart_name, file_label):
        """Create a chart, optionally save it, and register it under charts[metric]."""
        # Create the chart object
        print(f"INFO: Creating a {chart_name} object for metric \"{metric}\"...")
        chart = chart_class(
            df_data, metric, self.metadata, extra_args, chart_info,
            self.args.zoomable, self.args.top_per_class_ngrams
        )
        # Save the chart to the output folder
        if self.args.output_folder is not None:
            output_filepath = os.path.join(self.args.output_folder, metric)
            chart.save(output_filepath, file_label, self.args.output_formats)
        # Add the chart to the dictionary of metric-associated charts
        charts[metric][chart_name] = chart.base_chart

    # Build chart objects for each computed metric based on variable types and
    # semantics, then save them to the user-specified output folder
    for metric, df_data in self.df_metric_data.items():
        charts[metric] = dict()

        if metric == "stats":
            _build_chart(StatsBarChart, df_data, metric, {}, "BarChart", "StatsBarChart")
            continue

        if metric in ["ttr", "root_ttr", "log_ttr", "maas"]:
            _build_chart(DiversityBarChart, df_data, metric, {}, "BarChart", "DiversityBarChart")
            continue

        # Get dictionary containing information on which and how to create charts
        charts_metadata = self.get_charts_metadata(metric)

        if len(self.metadata["var_types"]) == 0:
            # No variables at all: only the text-only bar chart applies
            _build_chart(TextOnlyBarChart, df_data, metric, {}, "BarChart", "BarChart")
            continue

        # Iterate over the results and create and save charts based on these information
        charts_count = 0
        for ChartClass, chart_info in charts_metadata.items():
            # Check if at least a variable has bins defined
            no_bins = all(var_bin == 0 for var_bin in self.metadata["var_bins"])
            for_bins = chart_info["for_bins"]
            # Create only the subset of charts based on bins definition
            if (for_bins == "any") or (no_bins and (for_bins == False)) or ((not no_bins) and (for_bins == True)):
                _build_chart(
                    ChartClass, df_data, metric, chart_info,
                    ChartClass.__name__, ChartClass.__name__)
                charts_count += 1
        if charts_count == 0:
            print("No visualization is currently supported for the association metric(s) defined, but you can find the results in the output .json file.")

    return charts