# Source code for variationist.inspector

"""
The Inspector class, to handle all the operations of Variationist.
"""
import json
import os
import pandas as pd
import sys
from dataclasses import dataclass, asdict, field
from datasets import Dataset
from typing import Callable, List, Optional, Tuple, Union, Dict

from variationist import utils
from variationist.data import preprocess_utils
from variationist.data.tokenization import Tokenizer
from variationist.metrics import metrics


@dataclass
class InspectorArgs:
    """A dataclass to store all of the arguments that relate to the analysis.

    Parameters
    ----------
        text_names: List[str]
            The list of names of text columns in the given dataset to use for the analysis.
        var_names: List[str]
            The list of variable names to use for the analysis. Each string in var_names
            should correspond to a dataset column.
        metrics: List[Union[str, Callable]], *optional*
            The list of metrics that should be calculated. Each entry can be the name of
            a metric natively implemented by Variationist or a custom callable function.
            If left to None, `check_values` falls back to `["basic-stats"]`.
        var_types: List[str]
            The list of variable types corresponding to the variables in `var_names`. Should
            match the length of `var_names`. Available choices are `nominal` (default),
            `ordinal`, `quantitative`, and `coordinates`. These are mostly used for binning
            and visualization.
        var_semantics: List[str]
            The list of variable semantics corresponding to the variables in `var_names`.
            Should match the length of `var_names`. Available choices are `general`
            (default), `temporal`, and `spatial`. These are mostly used for binning and
            visualization.
        var_subsets: List, *optional*
            Stored together with the other arguments. It is not read anywhere in this
            module, so its exact semantics are not documented here.
        var_bins: List[int]
            The number of bins each variable in `var_names` should be split into, in the
            same order as `var_names`. Works with quantitative variables, dates and
            timestamps. Will default to 0 for each specified variable, indicating no
            binning.
        tokenizer: `str` or `Callable`, *optional*, defaults to `whitespace`
            The tokenizer used to preprocess the data. Will default to whitespace tokenization
            if not specified. Alternatively, it can be a string in the format "hf::tokenizer_name"
            for loading a HuggingFace tokenizer. A custom function can also be passed for
            tokenization. It should take as input an array of texts (assumed to be a Pandas Series)
            and the InspectorArgs. It should return the same array but tokenized. Check out our
            example notebooks for examples.
        language: str
            The language of the text in the dataset. Used for proper tokenization and stopword
            removal.
        n_tokens: int
            The number of tokens that should be considered for the analysis. 1 corresponds
            to unigrams, 2 corresponds to bigrams, and so on.
        n_cooc: int
            The number of tokens used for calculating non-consecutive co-occurrences. For
            example, n=2 means we consider as the base units for our analysis any pair of
            tokens that co-occur in the same sentence. n=3 means we consider triplets of
            tokens, etc. Defaults to n=1, meaning no co-occurrences are taken into
            consideration, and we only consider n-grams.
        unique_cooc: bool
            Whether to consider unique co-occurrences or not. Defaults to False (keep
            duplicate tokens). If True, multiple occurrences of the same token in a text
            will be discarded. This does not affect the co-occurrences window size by design
            (the window size considers the original number of tokens and therefore the
            original allowed maximum distance between tokens).
        cooc_window_size: int
            Size of the context window for co-occurrences. For instance, a
            `cooc_window_size` of 3 means we use a context window of 3 to calculate
            co-occurrences, meaning that any token that is within 3 tokens before or after
            a given token is added as a co-occurrence.
        freq_cutoff: int
            The token frequency, expressed as an integer, below which we do not consider
            the token in the analysis of pmi-based metrics. Defaults to 3.
        stopwords: bool
            Whether to remove stopwords from texts before tokenization or not (using
            default lists in a given `language`). Will default to False.
        custom_stopwords: `str` or `List`, *optional*, defaults to `None`
            A list of stopwords (or a path to a file containing stopwords, one per line)
            to be removed before tokenization. If `stopwords` is True, these stopwords will
            be added to that list. Will default to None.
        lowercase: bool
            Whether to lowercase all the texts before tokenization or not. Will default to
            False.
        ignore_null_var: bool
            Whether to proceed when null values are present for variables. Defaults to
            False, as this behavior can have unpredictable results. Set to True to treat
            "Nan" as any other variable value.
    """

    # NOTE: the field order is part of the public interface (positional construction).
    text_names: Optional[List] = None # explicit column name(s)
    var_names: Optional[List] = None # explicit variable name(s)
    metrics: Optional[List] = None
    var_types: Optional[List] = None # nominal (default), ordinal, quantitative, coordinates
    var_semantics: Optional[List] = None # general (default), temporal, spatial
    var_subsets: Optional[List] = None
    var_bins: Optional[List] = None
    tokenizer: Optional[Union[str, Callable]] = 'whitespace'
    language: Optional[str] = None
    n_tokens: Optional[int] = 1 # maximum value for this should be 5, otherwise the computation will explode
    n_cooc: Optional[int] = 1
    unique_cooc: Optional[bool] = False
    cooc_window_size: Optional[int] = 0
    freq_cutoff: Optional[int] = 3
    stopwords: Optional[bool] = False
    custom_stopwords: Optional[Union[str, list]] = None
    lowercase: Optional[bool] = False
    ignore_null_var: Optional[bool] = False


    @staticmethod
    def _callable_name(obj):
        """Return a JSON-friendly name for a non-string metric or tokenizer.

        Plain functions expose `__name__`; callable instances usually do not, so
        the name of their class is used instead of failing.
        """

        return getattr(obj, "__name__", None) or type(obj).__name__


    def check_values(self):
        """Checks the values in text_names, var_names and metrics.

        Terminates the program through `sys.exit` when `text_names` or `var_names`
        were not provided. When `metrics` was not provided, prints a warning and
        sets it to `["basic-stats"]` in place.
        """

        if self.text_names is None:
            sys.exit("ERROR: No text_names were provided. These are the names or indices of the columns containing the text to be analyzed.")
        if self.var_names is None:
            sys.exit("ERROR: No var_names were provided. These are the names or indices of the columns containing the variables to be analyzed.")
        if self.metrics is None:
            print("WARNING: No metrics were defined. Variationist will assume only some basic dataset statistics are needed. Please consult the documentation to read what metrics are natively supported and how to use your own.")
            self.metrics = ["basic-stats"]


    def to_dict(self):
        """Returns the InspectorArgs values inside a dictionary.

        Metrics and tokenizers that are not strings are replaced by their name so
        that the dictionary can later be converted to json. A `metrics` or
        `tokenizer` value of None is left as None.
        """

        self_as_dict = asdict(self)

        # `metrics` is still None when this runs before `check_values`
        if self.metrics is not None:
            self_as_dict["metrics"] = [
                metric if isinstance(metric, str) else self._callable_name(metric)
                for metric in self.metrics
            ]
        if self.tokenizer is not None and not isinstance(self.tokenizer, str):
            self_as_dict["tokenizer"] = self._callable_name(self.tokenizer)

        return self_as_dict
    

class Inspector:
    """
    The Inspector class. It takes care of orchestrating the analysis, from importing and 
    tokenizing the data to calculating the metrics and creating an output file with all 
    the calculated metrics for each text column, variable, and combination thereof. 

    Parameters
    ----------
    dataset: `datasets.Dataset` or `pandas.DataFrame` or `str`
        The dataset to be used for our analysis. It can be a pre-loaded pandas dataframe, 
        or a string indicating a filepath to a .tsv, .csv file, or a Huggingface dataset. 
        Huggingface datasets can also be imported using strings, with the following format: 
        'hf::DATASET_NAME'.
    args: `InspectorArgs`
        The Inspector arguments. Refer to the InspectorArgs class for details on what these 
        should be. Note that the given object is completed in place: missing `metrics`, 
        `var_types`, `var_semantics` and `var_bins` are filled with their defaults.
        
    """

    def __init__(
        self,
        dataset: "Union[Dataset, pd.DataFrame, str]" = None,
        args: "Optional[InspectorArgs]" = None,
        ):
        """Validates the arguments, loads the dataset into a dataframe and instantiates 
        the tokenizer. Any validation failure terminates the program through `sys.exit`."""
        
        self.dataset = dataset
        # Defaults are written into the args object below, so a missing `args` gets a
        # fresh instance rather than one shared between all Inspector objects.
        if args is None:
            args = InspectorArgs()
        self.args = args

        args.check_values()
        
        # Set defaults for variable types and semantics in case they are not defined
        n_vars = len(args.var_names)
        if args.var_types is None:
            default_type = "nominal"
            args.var_types = [default_type] * n_vars
            print(f"INFO: No values have been set for var_types. Defaults to {default_type}.")
        if args.var_semantics is None:
            default_semantics = "general"
            args.var_semantics = [default_semantics] * n_vars
            print(f"INFO: No values have been set for var_semantics. Defaults to {default_semantics}.")
        if args.var_bins is None:
            default_bin = 0
            args.var_bins = [default_bin] * n_vars

        # Dictionary for the metadata to be printed in the json output
        metadata_dict = args.to_dict()
        print("INFO: The metadata we will be using for the current analysis are:")
        print(metadata_dict)
        # Placeholder: replaced by a dataset name below for Dataset and DataFrame inputs
        metadata_dict["dataset"] = self.dataset 
        self.metadata_dict = metadata_dict
        
        # Check if variable definitions match in length
        per_variable_lists = [args.var_types, args.var_semantics, args.var_bins]
        if any(len(values) != n_vars for values in per_variable_lists):
            sys.exit(f"ERROR! All variables in {args.var_names} should have an associated "
                            f"variable type, semantics, and bins. We instead got var_types: {args.var_types}, "
                            f"var_semantics: {args.var_semantics}, and var_bins: {args.var_bins}. Please provide "
                            f"an ordered list of types, semantics, and bins that match variable names "
                            f"and which have matching length for correct variable assignment.")
        
        # Check if column strings are names or indices (for both texts and labels)
        text_names_type = utils.check_column_type(args.text_names)
        label_names_type = utils.check_column_type(args.var_names)
    
        # Since the input file/dataset is the same, we require texts and labels columns to be of the same type
        if text_names_type != label_names_type:
            sys.exit(f"ERROR! text_cols are {text_names_type} while label_cols are {label_names_type}. "
                            "Please provide all column identifiers as names (as in the header line) or indices.")
        self.cols_type = text_names_type
        print(f"INFO: all column identifiers are treated as column {self.cols_type}.")
        
        # isinstance (rather than an exact type match) also accepts subclasses
        if isinstance(self.dataset, Dataset):
            self.dataframe = pd.DataFrame(self.dataset)
            self.metadata_dict["dataset"] = self.dataset.info.dataset_name
        elif isinstance(self.dataset, pd.DataFrame):
            dataset_name = getattr(self.dataset, "name", "Custom_User_DataFrame")
            # Attribute access on a dataframe with a column called "name" returns that
            # column instead of a user-assigned name; it cannot go into the json metadata.
            if isinstance(dataset_name, (pd.Series, pd.DataFrame)):
                dataset_name = "Custom_User_DataFrame"
            self.metadata_dict["dataset"] = dataset_name
            self.dataframe = self.dataset
        elif isinstance(self.dataset, str):
            self.dataframe = utils.convert_file_to_dataframe(self.dataset, cols_type=self.cols_type)
        else:
            sys.exit(f"The specified dataset is not one of the accepted ones (string, a pandas DataFrame or a Huggingface Dataset), but a type {type(self.dataset)} instead.")
            
        
        # Create a dictionary containing the specified column strings (values) for texts and labels (keys)
        self.col_names_dict = {
            utils.TEXT_COLS_KEY: args.text_names,
            utils.LABEL_COLS_KEY: args.var_names
        }

        # Instantiate the tokenizer
        self.tokenizer = Tokenizer(self.args)
        
        self.check_columns()
        self.check_nan_values()
        
        # Check if we need to bin or discretize any values
        self.discretize = any(n_bins != 0 for n_bins in args.var_bins)


    def check_columns(self):
        """A function to check that the specified text and variable columns are actually in 
        the provided dataset. Stores the dataframe column names in `self.dataframe_cols` and 
        terminates the program through `sys.exit` on the first missing column."""

        # Check if the specified columns are actually in the dataframe
        self.dataframe_cols = list(self.dataframe.columns)
        for col in self.args.text_names + self.args.var_names:
            if col not in self.dataframe_cols:
                sys.exit(f"ERROR: the '{col}' column is not present in the dataframe.")
    
    
    def check_nan_values(self):
        """Checks if the specified variable columns contain Nan values. If they do, either 
        prints a notice (when `ignore_null_var` is True) or terminates the program through 
        `sys.exit`."""

        for var in self.args.var_names:
            nulls = self.dataframe[var].isnull()
            if not nulls.values.any():
                continue
            null_indices = list(nulls[nulls].index)
            if self.args.ignore_null_var:
                print(f"INFO: One or more null values were found for the '{var}' variable. The indices (lines) of null values are {null_indices}. Since 'ignore_null_var' was set to True, Nan values will be treated as any other variable value. This might lead to unexpected results.")
            else:
                sys.exit(f"ERROR: One or more null values were found for the '{var}' variable. The indices (lines) of null values are {null_indices}. If you wish to ignore null values and proceed, please set 'ignore_null_var' to True when defining the InspectorArgs.")
    
            
    def handle_bins_and_granularity(self):
        """For each variable that requires binning, checks that it can be carried out and calls 
        the dedicated function. The binned values replace the variable column in 
        `self.dataframe`. Invalid binning requests terminate the program through `sys.exit`."""

        per_variable = zip(self.args.var_names, self.args.var_bins,
                           self.args.var_types, self.args.var_semantics)
        for curr_var_name, curr_bins, curr_type, curr_sem in per_variable:
            if curr_bins == 0:
                # no binning requested for this variable
                continue
            if curr_type == "nominal":
                sys.exit(f"ERROR: var_bins was defined for variable {curr_var_name}, whose type is 'nominal'. However, nominal values cannot be divided into bins. If the {curr_var_name} variable is numeric, please specify another var_type for it. If it is an actual nominal variable, its var_bins value should be 0.")
            if curr_type == "ordinal" and curr_sem != "temporal":
                sys.exit(f"ERROR: var_bins was defined for variable {curr_var_name}, whose type is 'ordinal' but its semantics is not 'temporal'. However, ordinal values cannot be divided into bins if not of temporal semantics. If the {curr_var_name} variable is numeric, please specify another var_type for it. If it is an actual ordinal variable but not temporal, its var_bins value should be 0.")
            # Exact type match on purpose: isinstance would also let `True` through as 1 bin
            if type(curr_bins) is not int:
                sys.exit(f"ERROR: var_bins was defined, but not correctly. We expected a list of integer values for each variable (with 0 for variables where no binning is desired), but instead for the variable {curr_var_name} the input was of type {type(curr_bins).__name__}.")

            curr_var_column = self.dataframe[curr_var_name]
            if curr_sem == "temporal":
                curr_var_column = pd.to_datetime(curr_var_column)
            print(f"INFO: For the variable {curr_var_name}, bins were defined. It will therefore be split into {curr_bins} equal bins.")
            self.dataframe[curr_var_name] = preprocess_utils.discretize_bins_col(
                curr_var_column, curr_bins
            )


    def preprocess(self):
        """Performs all of the preprocessing operations of Variationist, such as grouping 
        together variables and dividing variables into bins.

        Returns
        -------
        tuple
            The `(label_values_dict, subsets_of_interest)` pair produced by the 
            `preprocess_utils` helpers.
        """

        # Check if any discretization or binning should be carried out and do it
        if self.discretize:
            self.handle_bins_and_granularity()
        
        label_values_dict = preprocess_utils.get_label_values(self.dataframe, self.col_names_dict)
        if len(self.args.var_names) == 1 and len(self.args.text_names) == 1:
            subsets_of_interest = preprocess_utils.get_subset_dict(self.dataframe,
                                                    self.tokenizer.tokenized_col_dict,
                                                    label_values_dict)
        else:        
            # with more than one variable or more than one text column, we are 
            # interested in the intersections between them
            subsets_of_interest = preprocess_utils.get_subset_intersections(self.dataframe,
                                                    self.tokenizer.tokenized_col_dict,
                                                    label_values_dict)
            label_values_dict = preprocess_utils.update_label_values_dict_with_inters(
                label_values_dict, self.args.text_names)
        
        return label_values_dict, subsets_of_interest


    def compute(self):
        """Main function carrying out the entire analysis pipeline. It creates a results dict 
        with the calculated metrics, keyed by metric name, and stores it in 
        `self.results_dict`.

        Returns
        -------
        tuple
            The `(subsets_of_interest, results_dict)` pair.
        """

        label_values_dict, subsets_of_interest = self.preprocess()
        
        results_dict = {}
        for metric in self.args.metrics:
            current_metric = metrics.Metric(metric, self.args)
            if isinstance(metric, str):
                metric_name = metric
            else:
                # custom callables are reported under their own name
                metric_name = getattr(metric, "__name__", None) or type(metric).__name__
            print(f"INFO: Currently calculating metric: '{metric_name}'")
            
            metric_values = current_metric.calculate_metric(
                label_values_dict, subsets_of_interest)
            # NOTE(review): InspectorArgs.check_values falls back to "basic-stats" while 
            # this branch tests for "stats" -- confirm which name metrics.Metric expects.
            if metric_name == "stats":
                results_dict[metric_name] = metric_values
            else:
                # every other metric is nested under the first key of label_values_dict
                first_label_key = list(label_values_dict)[0]
                results_dict[metric_name] = {first_label_key: metric_values}
            
        self.results_dict = results_dict

        return subsets_of_interest, results_dict

    
    def create_output_dict(self):
        """Function to create the output dictionary, containing both metadata and calculated 
        metrics. The result is stored in `self.output_dict`; `compute` has to run first, since 
        it is what sets `self.results_dict`."""

        self.output_dict = {
            "metadata": self.metadata_dict,
            "metrics": self.results_dict,
        }
    

    def inspect(self):
        """Wrapper function for tokenizing, carrying out computation, and saving the output 
        dictionary, which it returns."""

        self.dataframe = self.tokenizer.tokenize(self.dataframe)
        self.compute()
        self.create_output_dict()

        return self.output_dict


    def save_output_to_json(self,
                            output_path = "output.json"
                            ):
        """Saves the output dictionary to a json file, which can then be imported with the 
        Visualizer module.

        Parameters
        ----------
        output_path: str
            Where to write the json file. Defaults to "output.json".

        Raises
        ------
        AttributeError
            If no output dictionary exists yet (it is created by `inspect` or 
            `create_output_dict`). Nothing is written in that case.
        """

        # Read the attribute first so a missing output does not leave an empty file behind
        output_dict = self.output_dict
        with open(output_path, "w", encoding="utf-8") as output_file:
            json.dump(output_dict, output_file, indent=4)