# Source code for variationist.visualizer

import altair as alt
import os
import pandas as pd
from typing import Any, Optional, Union

from variationist import utils
from variationist.visualization import chart_utils
from variationist.visualization.diversity_bar_chart import DiversityBarChart
from variationist.visualization.text_only_bar_chart import TextOnlyBarChart
from variationist.visualization.stats_bar_chart import StatsBarChart


class VisualizerArgs:
    """A class storing the arguments for the visualization component.

    Parameters
    ----------
        output_folder: Optional[str] = None
            A path to the output folder in which to store the charts and associated
            metadata. If the folder does not exist, it will be automatically created.
            If no path is provided, the charts will not be serialized and the possible
            output_formats will be ignored (in this case, the chart objects will be
            only accessible from the dictionary returned by the "create()" function
            and be shown by using the "show()" function).
        output_formats: Optional[list[str]] = ["html"]
            A list of output formats for the charts. By default, only the interactive
            HTML chart is saved, i.e., ["html"]. Extra choices: ["pdf", "svg", "png"].
            A list passed here is copied, so each instance owns its own list.
        zoomable: Optional[bool] = True
            Whether the (HTML) chart should be zoomable using the mouse or not.
        top_per_class_ngrams: Optional[int] = 20
            The maximum number of highest scoring per-class n-grams to show (for bar
            charts only). If set to None, it will show all the n-grams in the corpus
            (it may easily be overwhelming). By default is 20 to keep the visualization
            compact. This parameter is ignored when creating other chart types.
        ngrams: Optional[list[str]] = None
            A list of n-grams of interest to focus the resulting visualizations on.
            N-grams should match the number of tokens used in the prior computation
            reflected by the "results" variable (e.g., if unigrams were chosen, this
            list should only contain unigrams).
        shapefile_path: Optional[str] = None
            A path to the .shp shapefile to be visualized as background map to the chart
            (needed only when including a variable type "nominal" with "spatial"
            semantics). Note that auxiliary files to the .shp one (i.e., .dbf, .prg,
            .shx ones) are required for chart creation too, but do not need to be
            specified. They should have the same name as the .shp file but different
            extension, and be located in the same folder as the .shp file itself. An
            example of repository where to find shapefiles is
            https://geodata.lib.berkeley.edu/, but there exist many other ones and
            shapefiles provided by national/regional institutions.
        shapefile_var_name: Optional[str] = None
            The key field name in the shapefile which contains the names for the areas
            which should match the possible values for the variable of interest (e.g.,
            if the variable of interest is "state", here should go the name of the
            variable name encoded in the shapefile containing the possible states)."""

    def __init__(
        self,
        output_folder: Optional[str] = None,
        output_formats: Optional[list[str]] = ["html"],
        zoomable: Optional[bool] = True,
        top_per_class_ngrams: Optional[int] = 20,
        ngrams: Optional[list[str]] = None,
        shapefile_path: Optional[str] = None,
        shapefile_var_name: Optional[str] = None,
    ) -> None:
        """
        A function that initializes the arguments useful for visualizing charts.
        """

        # The default ["html"] list object is created once, at function definition
        # time. Storing it directly would let a mutation on one instance (e.g.,
        # args.output_formats.append("png")) leak into every later instance, so a
        # list argument is copied. Non-list values (e.g., None) are stored as given.
        if isinstance(output_formats, list):
            output_formats = list(output_formats)

        self.output_folder = output_folder
        self.output_formats = output_formats
        self.zoomable = zoomable
        self.top_per_class_ngrams = top_per_class_ngrams
        self.ngrams = ngrams
        self.shapefile_path = shapefile_path
        self.shapefile_var_name = shapefile_var_name


class Visualizer:
    """A class for the visualization component. It orchestrates the creation of charts 
    based on the results and metadata from a prior analysis using Variationist.
    
    Parameters
    ----------
        input_json: `str` or `dict`
            A path to the json file or a json/dict object storing metadata and results 
            from a prior analysis using Variationist.
        args: VisualizerArgs
            A VisualizerArgs object containing the arguments for the Visualizer"""

    def __init__(
        self,
        input_json: Union[str, dict],
        args: "VisualizerArgs",
    ) -> None:
        """
        A function that initializes the arguments of the visualizer, the metadata, and
        the per-metric long-form dataframes that will be used for visualization.

        Parameters
        ----------
        input_json: `str` or `dict`
            A path to the json file or a json/dict object storing metadata and results
            from a prior analysis using Variationist.
        args: VisualizerArgs
            The arguments for the visualizer.
        """

        # Store the visualizer arguments
        self.args = args
        self.df_metric_data = dict()
        self.variable_values = dict()

        # Load the json object storing metadata and results
        json_data = utils.load_json_data_from_filepath_or_dict(input_json)

        # Work on a private copy of the metadata and of the per-variable lists that
        # are edited in place below (and in get_charts_metadata). If the loader hands
        # back the caller's own dict, editing it directly would make a second
        # Visualizer built from the same object insert "text_name" a second time.
        self.metadata = dict(json_data["metadata"])
        for key in ("var_names", "var_types", "var_semantics", "var_bins"):
            if isinstance(self.metadata.get(key), list):
                self.metadata[key] = list(self.metadata[key])

        # variable_names intentionally refers to the same list as metadata["var_names"],
        # so that the insertion below is visible through both
        self.variable_names = self.metadata["var_names"]

        # Handle case in which 2 text columns are provided (@TODO: Thoroughly test)
        if len(self.metadata["text_names"]) == 2:
            if len(self.variable_names) >= 1:
                # The text column name is treated as an extra leading nominal variable
                self.variable_names.insert(0, "text_name")
                self.metadata["var_types"].insert(0, "nominal")
                self.metadata["var_semantics"].insert(0, "general")
                self.metadata["var_bins"].insert(0, 0)
            else:
                self.variable_names = ["text_name::"]

        # The concatenated string used as key when multiple variables are present
        # (it is the same for all metrics, so it is computed only once)
        var_names_concat = utils.MULTI_VAR_SEP.join(self.variable_names)

        # Get per-metric long-form dataframes from the json
        for metric in self.metadata["metrics"]:
            metric_data = json_data["metrics"][metric]

            if metric == "stats":
                # Retrieve the possible values for the variable (combination) and the given metric
                self.variable_values[metric] = list(
                    metric_data["num_texts"][var_names_concat].keys())

                # Get the long-form dataframe
                self.df_metric_data[metric] = self.get_stats_df_from_json(
                    json_data = metric_data,
                    var_names_concat = var_names_concat)

            else:
                # Retrieve the possible values for the variable (combination) and the given metric
                self.variable_values[metric] = list(
                    metric_data[var_names_concat].keys())

                # Get the long-form dataframe
                self.df_metric_data[metric] = self.get_df_from_json(
                    json_data = metric_data,
                    var_names_concat = var_names_concat,
                    top_per_class_ngrams = self.args.top_per_class_ngrams,
                    focus_ngrams = self.args.ngrams)


    def get_df_from_json(
        self,
        json_data: dict[str, Any],
        var_names_concat: str,
        top_per_class_ngrams: Optional[int],
        focus_ngrams: Optional[list[str]] = None,
    ) -> pd.DataFrame:
        """
        A function that returns a long-form dataframe from a json which 
        stores the information about a prior analysis using Variationist.
        Optionally, it takes a list of n-grams to focus the filtering on.

        Parameters
        ----------
        json_data: dict[str, Any]
            The json object storing the results from a prior analysis in the form:
            {varA: {ngram1: value1, ngram2: value2, ...}, varB: {...}, ...}. Note
            that varA, varB, etc. could also take the form of "::"-concatenated
            variable names if multiple variables are present in the analysis.
        var_names_concat: str
            A string denoting the ordered concatenation of variable names (i.e., 
            original column names), separated by utils.MULTI_VAR_SEP, to be used for 
            giving meaningful names to the long-form dataframe.
        top_per_class_ngrams: Optional[int]
            The maximum number of highest scoring per-class n-grams to show (for bar
            charts only). Note: this method accepts the value but does not use it;
            no top-k filtering is applied while building the dataframe here.
        focus_ngrams: list[str], *optional*, defaults to `None`
            A list of n-grams of interest to focus the filtering on. N-grams should 
            match the number of tokens used in the prior computation (e.g., if 
            unigrams were chosen, this list should only contain unigrams).

        Returns
        -------
        df_data: pd.core.frame.DataFrame
            A long-form dataframe with one column per variable name, plus the
            "ngram" and "value" columns.
        """

        # Get the individual variables and initialize a column for each of them
        var_names = var_names_concat.split(utils.MULTI_VAR_SEP)
        variables = {var_name: [] for var_name in var_names}
        ngrams, values = [], []

        # Iterate through variable values and ngram-value pairs and keep those of interest
        for variable, raw_items in json_data[var_names_concat].items():
            # A (possibly multi-variable) value is split once into per-variable parts
            var_values = str(variable).split(utils.MULTI_VAR_SEP)

            for ngram, value in raw_items.items():
                if (focus_ngrams is not None) and (ngram not in focus_ngrams):
                    continue

                for idx, var_name in enumerate(var_names):
                    variables[var_name].append(var_values[idx])
                ngrams.append(ngram)
                values.append(value)

        # Create the long-form dataframe
        dict_data = variables
        dict_data["ngram"] = ngrams
        dict_data["value"] = values
        df_data = pd.DataFrame(dict_data)

        return df_data


    def get_stats_df_from_json(
        self,
        json_data: dict[str, Any],
        var_names_concat: str,
    ) -> pd.DataFrame:
        """
        A function that returns a long-form dataframe from a json which 
        stores the basic statistics from a prior analysis using Variationist.
        This is a variant of get_df_from_json() to handle basic stats.

        Parameters
        ----------
        json_data: dict[str, Any]
            The json object storing the results from a prior analysis in the form:
            {substatA: {colnameA: {varA: value1, ...}, ...}, substatB: {colnameA:
            {varA: {"mean": value, "stdev": value}, ...}, ...}, ...}. Note
            that varA, varB, etc. could also take the form of "::"-concatenated
            variable names if multiple variables are present in the analysis.
        var_names_concat: str
            A string denoting the ordered concatenation of variable names (i.e., 
            original column names), separated by utils.MULTI_VAR_SEP, to be used for 
            giving meaningful names to the long-form dataframe.

        Returns
        -------
        df_data: pd.core.frame.DataFrame
            A long-form dataframe with one column per variable name, plus the
            "statistics", "val_1" and "val_2" columns. For dict-valued entries
            "val_1"/"val_2" hold the "mean"/"stdev"; otherwise "val_1" holds the
            raw value and "val_2" is None.
        """

        # Get the individual variables and initialize a column for each of them
        var_names = var_names_concat.split(utils.MULTI_VAR_SEP)
        variables = {var_name: [] for var_name in var_names}
        submetric_list, val_1_list, val_2_list = [], [], []

        # Iterate through the dictionary to create lists for creating the dataframe
        for submetric, vars_label_vals in json_data.items():
            for label_vals in vars_label_vals.values():
                for label, vals in label_vals.items():
                    if isinstance(vals, dict):
                        val_1, val_2 = vals["mean"], vals["stdev"]
                    else:
                        val_1, val_2 = vals, None

                    # A (possibly multi-variable) label is split once into per-variable parts
                    label_parts = str(label).split(utils.MULTI_VAR_SEP)
                    for idx, var_name in enumerate(var_names):
                        variables[var_name].append(label_parts[idx])
                    submetric_list.append(submetric)
                    val_1_list.append(val_1)
                    val_2_list.append(val_2)

        # Create the long-form dataframe
        dict_data = variables
        dict_data["statistics"] = submetric_list
        dict_data["val_1"] = val_1_list
        dict_data["val_2"] = val_2_list
        df_data = pd.DataFrame(dict_data)

        return df_data


    def get_charts_metadata(
        self,
        metric: str,
    ) -> dict[str, Any]:
        """
        A function that returns a dictionary containing information on which and how to 
        create charts given prior analysis' var_types and var_semantics metadata.

        Note that, when more than one variable is present, quantitative variables
        with bins defined are relabeled as "nominal" directly in
        self.metadata["var_types"] (i.e., the change persists after the call).

        Parameters
        ----------
        metric: str
            The metric associated to the "df_data" dataframe and thus to the charts.
            It is currently not used for the lookup itself.

        Returns
        -------
        charts_metadata: dict[str, Any]
            A dictionary containing the chart types and information on how to create
            them. It is empty if no variables have been defined.

        Raises
        ------
        ValueError
            If there are more than three variables, or if the variable type(s) or
            their combination with the variable semantics are not found in
            chart_utils.VAR_CHARTS_MAP.
        """

        # Get lists of attributes for variables
        var_types = self.metadata["var_types"]
        var_semantics = self.metadata["var_semantics"]
        var_bins = self.metadata["var_bins"]

        # Double check the lengths of var_* (they must be the same)
        assert len(var_types) == len(var_semantics) == len(var_bins)

        num_vars = len(var_types)

        # Skip if no variables have been defined (e.g., case 2 text columns only)
        if num_vars == 0:
            return {}

        # At maximum three variables are supported
        if num_vars > 3:
            raise ValueError(
                f"Visualization for {len(var_types)} variable types is not supported yet.")

        # Get the key for dimensions (the amount of dimensions equals to #variables+2)
        dims_key = str(num_vars + 2) + "-dims"

        # If there is only a variable, there is no need to order/join names, just take the values
        if num_vars == 1:
            var_types_key = var_types[0]
            var_semantics_key = var_semantics[0]
        # Otherwise, we need to create an ordered concatenation of variables for searching
        else:
            # If bins are used and the variable was originally quantitative, change it to nominal here
            for idx, (var_type, var_bin) in enumerate(zip(var_types, var_bins)):
                if (var_type == "quantitative") and (var_bin != 0):
                    var_types[idx] = "nominal"

            # Types and semantics are sorted together, as (type, semantics) pairs
            var_types_ord, var_semantics_ord = zip(*sorted(zip(var_types, var_semantics)))
            var_types_key = '-'.join(var_types_ord)
            var_semantics_key = '-'.join(var_semantics_ord)

        # Check if the variable type(s) are supported
        charts_for_dims = chart_utils.VAR_CHARTS_MAP[dims_key]
        if var_types_key not in charts_for_dims:
            raise ValueError(
                f"Visualization for \"{var_types_key}\" variable type(s) is not supported.")

        # Check if the combination of the variable type(s) and semantics are supported
        charts_for_types = charts_for_dims[var_types_key]
        if var_semantics_key not in charts_for_types:
            raise ValueError(
                f"Visualization for \"{var_types_key}\" variable type(s) and \"{var_semantics_key}\" "
                f"variable semantics is currently not supported. If you have any idea to effectively "
                f"visualize such combination of types and semantics, we would be happy if you let us "
                f"know by opening an issue at: https://github.com/dhfbk/variationist/issues.")

        # Take the dictionary with chart building information
        return charts_for_types[var_semantics_key]


    def create(
        self,
    ) -> "dict[str, dict[str, alt.Chart]]":
        """
        A function that orchestrates the creation of charts based on the results
        and metadata from a prior analysis using Variationist, returning a dictionary 
        of metrics (keys) and an associated dictionary of chart objects (values).

        Returns
        -------
        charts: dict[str, dict[str, alt.Chart]]
            A dictionary containing the metrics as keys and, as values, a dictionary
            mapping a chart name to the "base_chart" of the created chart object.
        """

        # A dictionary holding the chart objects to be returned to the user
        # This is especially useful when the user would show the chart in a notebook
        charts = {}

        # Create a dictionary of chart-specific arguments
        extra_args = {}
        if self.args.shapefile_path is not None:
            extra_args["shapefile_path"] = self.args.shapefile_path
        if self.args.shapefile_var_name is not None:
            extra_args["shapefile_var_name"] = self.args.shapefile_var_name

        # Build chart objects for each computed metric based on variable types and
        # semantics, then save them to the user-specified output folder
        for metric, df_data in self.df_metric_data.items():
            charts[metric] = dict()

            if metric == "stats":
                charts[metric]["BarChart"] = self._build_chart(
                    StatsBarChart, df_data, metric, extra_args, {},
                    "BarChart", "StatsBarChart")

            elif metric in ("ttr", "root_ttr", "log_ttr", "maas"):
                charts[metric]["BarChart"] = self._build_chart(
                    DiversityBarChart, df_data, metric, extra_args, {},
                    "BarChart", "DiversityBarChart")

            else:
                # Get dictionary containing information on which and how to create charts
                charts_metadata = self.get_charts_metadata(metric)

                if len(self.metadata["var_types"]) == 0:
                    # The chart is returned to the user regardless of whether an
                    # output folder has been defined, as for the other chart types
                    charts[metric]["BarChart"] = self._build_chart(
                        TextOnlyBarChart, df_data, metric, extra_args, {},
                        "BarChart", "BarChart")
                else:
                    # Check if at least a variable has bins defined
                    has_bins = not all(var_bin == 0 for var_bin in self.metadata["var_bins"])

                    # Iterate over the results and create and save charts based on these information
                    charts_count = 0
                    for ChartClass, chart_info in charts_metadata.items():
                        # Create only the subset of charts based on bins definition
                        for_bins = chart_info["for_bins"]
                        if (for_bins == "any") or (for_bins == has_bins):
                            chart_name = ChartClass.__name__
                            charts[metric][chart_name] = self._build_chart(
                                ChartClass, df_data, metric, extra_args, chart_info,
                                chart_name, chart_name)
                            charts_count += 1

                    if charts_count == 0:
                        print("No visualization is currently supported for the association metric(s) defined, but you can find the results in the output .json file.")

        return charts


    def _build_chart(
        self,
        chart_class: Any,
        df_data: pd.DataFrame,
        metric: str,
        extra_args: dict[str, Any],
        chart_info: dict[str, Any],
        display_name: str,
        save_name: str,
    ) -> Any:
        """
        A helper that creates a single chart object, saves it if an output folder
        has been defined, and returns its "base_chart".

        Parameters
        ----------
        chart_class: Any
            The chart class to instantiate. It is called with (df_data, metric,
            metadata, extra_args, chart_info, zoomable, top_per_class_ngrams) and the
            resulting object is expected to expose "save()" and "base_chart".
        df_data: pd.core.frame.DataFrame
            The long-form dataframe associated to the metric.
        metric: str
            The metric the chart is created for (also used as output subfolder name).
        extra_args: dict[str, Any]
            The chart-specific arguments (e.g., shapefile information).
        chart_info: dict[str, Any]
            The information on how to create the chart (may be empty).
        display_name: str
            The chart name shown in the INFO message.
        save_name: str
            The chart name passed to the "save()" function of the chart.

        Returns
        -------
        base_chart: Any
            The "base_chart" attribute of the created chart object.
        """

        # Create the chart object
        print(f'INFO: Creating a {display_name} object for metric "{metric}"...')
        chart = chart_class(
            df_data, metric, self.metadata, extra_args, chart_info,
            self.args.zoomable, self.args.top_per_class_ngrams
        )

        # Save the chart to the output folder
        if self.args.output_folder is not None:
            output_filepath = os.path.join(self.args.output_folder, metric)
            chart.save(output_filepath, save_name, self.args.output_formats)

        return chart.base_chart