"""
The Inspector class, to handle all the operations of Variationist.
"""
import json
import os
import pandas as pd
import sys
from dataclasses import dataclass, asdict, field
from datasets import Dataset
from typing import Callable, List, Optional, Tuple, Union, Dict
from variationist import utils
from variationist.data import preprocess_utils
from variationist.data.tokenization import Tokenizer
from variationist.metrics import metrics
@dataclass
class InspectorArgs:
    """A dataclass to store all of the arguments that relate to the analysis.

    Parameters
    ----------
    text_names: List[str]
        The list of names of text columns in the given dataset to use for the analysis.
    var_names: List[str]
        The list of variable names to use for the analysis. Each string in var_names
        should correspond to a dataset column.
    metrics: List[str, Callable], *optional*
        The list of metrics that should be calculated. It can be one of the metrics
        natively implemented by Variationist or a custom callable function. If left
        unset, `check_values` replaces it with `["basic-stats"]`.
    var_types: List[str]
        The list of variable types corresponding to the variables in `var_names`. Should
        match the length of `var_names`. Available choices are `nominal` (default),
        `ordinal`, `quantitative`, and `coordinates`. These are mostly used for binning
        and visualization.
    var_semantics: List[str]
        The list of variable semantics corresponding to the variables in `var_names`.
        Should match the length of `var_names`. Available choices are `general`
        (default), `temporal`, and `spatial`. These are mostly used for binning and
        visualization.
    var_subsets: List, *optional*, defaults to `None`
        NOTE(review): this field is not read anywhere in this module and its
        semantics are not documented here -- confirm against its consumers.
    var_bins: List[int]
        The number of bins each variable in `var_names` should be split into for the
        analysis, one integer per variable (0 meaning no binning). Works with
        quantitative variables, dates and timestamps. Will default to 0 for each
        specified variable, indicating 0 bins.
    tokenizer: `str` or `Callable`, *optional*, defaults to `whitespace`
        The tokenizer used to preprocess the data. Will default to whitespace tokenization
        if not specified. Alternatively, it can be a string in the format
        "hf::tokenizer_name" for loading a HuggingFace tokenizer. A custom function can
        also be passed for tokenization. It should take as input an array of texts
        (assumed to be a Pandas Series) and the InspectorArgs. It should return the same
        array but tokenized. Check out our example notebooks for examples.
    language: str
        The language of the text in the dataset. Used for proper tokenization and stopword
        removal.
    n_tokens: Int
        The number of tokens that should be considered for the analysis. 1 corresponds to
        unigrams, 2 corresponds to bigrams, and so on.
    n_cooc: Int
        The number of tokens used for calculating non-consecutive co-occurrences. For
        example, n=2 means we consider as the base units for our analysis any pair of
        tokens that co-occur in the same sentence. n=3 means we consider triplets of
        tokens, etc. Defaults to n=1, meaning no co-occurrences are taken into
        consideration, and we only consider n-grams.
    unique_cooc: Bool
        Whether to consider unique co-occurrences or not. Default to False (keep duplicate
        tokens). If True, multiple occurrences of the same token in a text will be
        discarded. This does not affect the co-occurrences window size by design (the
        window size considers the original number of tokens and therefore the original
        allowed maximum distance between tokens).
    cooc_window_size: Int
        Size of the context window for co-occurrences. For instance, a `cooc_window_size`
        of 3 means we use a context window of 3 to calculate co-occurrences, meaning that
        any token that is within 3 tokens before or after a given token is added as a
        co-occurrence.
    freq_cutoff: Int
        The token frequency, expressed as an integer, below which we do not consider the
        token in the analysis of pmi-based metrics. Defaults to 3.
    stopwords: Bool
        Whether to remove stopwords from texts before tokenization or not (using default
        lists in a given `language`). Will default to False.
    custom_stopwords: `str` or `List`, *optional*, defaults to `None`
        A list of stopwords (or a path to a file containing stopwords, one per line) to be
        removed before tokenization. If `stopwords` is True, these stopwords will be added
        to that list. Will default to None.
    lowercase: Bool
        Whether to lowercase all the texts before tokenization or not. Will default to
        False.
    ignore_null_var: Bool
        Whether to proceed when null values are present for variables. Defaults to False,
        as this behavior can have unpredictable results. Set to True to treat "Nan" as any
        other variable value.
    """

    text_names: Optional[List] = None  # explicit column name(s)
    var_names: Optional[List] = None  # explicit variable name(s)
    metrics: Optional[List] = None
    var_types: Optional[List] = None  # nominal (default), ordinal, quantitative, coordinates
    var_semantics: Optional[List] = None  # general (default), temporal, spatial
    var_subsets: Optional[List] = None
    var_bins: Optional[List] = None
    tokenizer: Optional[Union[str, Callable]] = 'whitespace'
    language: Optional[str] = None
    n_tokens: Optional[int] = 1  # maximum value for this should be 5, otherwise the computation will explode
    n_cooc: Optional[int] = 1
    unique_cooc: Optional[bool] = False
    cooc_window_size: Optional[int] = 0
    freq_cutoff: Optional[int] = 3
    stopwords: Optional[bool] = False
    custom_stopwords: Optional[Union[str, list]] = None
    lowercase: Optional[bool] = False
    ignore_null_var: Optional[bool] = False

    def check_values(self):
        """Checks the values in text_names, var_names and metrics.

        Terminates the program through `sys.exit` when `text_names` or `var_names`
        is missing. When `metrics` is missing, a warning is printed and `metrics`
        is set to `["basic-stats"]`.
        """
        if self.text_names is None:
            sys.exit("ERROR: No text_names were provided. These are the names or indices of the columns containing the text to be analyzed.")
        if self.var_names is None:
            sys.exit("ERROR: No var_names were provided. These are the names or indices of the columns containing the variables to be analyzed.")
        if self.metrics is None:
            print("WARNING: No metrics were defined. Variationist will assume only some basic dataset statistics are needed. Please consult the documentation to read what metrics are natively supported and how to use your own.")
            self.metrics = ["basic-stats"]

    @staticmethod
    def _callable_name(obj):
        """Return a printable name for a non-string metric or tokenizer object.

        Objects without a `__name__` (e.g. `functools.partial` instances) fall
        back to the name of their type instead of raising `AttributeError`.
        """
        return getattr(obj, "__name__", None) or type(obj).__name__

    def to_dict(self):
        """Returns the InspectorArgs values inside a dictionary.

        Callable metrics and a callable tokenizer are replaced by their names so
        that the dictionary can later be converted to json. The instance itself
        is left untouched.

        Returns
        -------
        dict
            A copy of all the fields, keyed by field name.
        """
        self_as_dict = asdict(self)
        # `metrics` is still None when this is called before `check_values`.
        if self.metrics is not None:
            self_as_dict["metrics"] = [
                metric if isinstance(metric, str) else self._callable_name(metric)
                for metric in self.metrics
            ]
        if self.tokenizer is not None and not isinstance(self.tokenizer, str):
            self_as_dict["tokenizer"] = self._callable_name(self.tokenizer)
        return self_as_dict
class Inspector:
    """
    The Inspector class. It takes care of orchestrating the analysis, from importing and
    tokenizing the data to calculating the metrics and creating an output file with all
    the calculated metrics for each text column, variable, and combination thereof.

    Parameters
    ----------
    dataset: `datasets.Dataset` or `pandas.DataFrame` or `str`
        The dataset to be used for our analysis. It can be a pre-loaded pandas dataframe,
        or a string indicating a filepath to a .tsv, .csv file, or a Huggingface dataset.
        Huggingface datasets can also be imported using strings, with the following format:
        'hf::DATASET_NAME'.
    args: `InspectorArgs`
        The Inspector arguments. Refer to the InspectorArgs class for details on what these
        should be.
    """

    def __init__(
        self,
        dataset: Union[Dataset, pd.DataFrame, str] = None,
        args: Optional[InspectorArgs] = None,
    ):
        """Validates the arguments, loads the dataset into a dataframe and sets up
        the tokenizer.

        Note that missing `var_types`, `var_semantics`, `var_bins` (and, through
        `check_values`, `metrics`) are filled in on the passed `args` object itself.
        The program is terminated through `sys.exit` on invalid input.
        """
        # Build the default per call: the args object is mutated below, so a single
        # default instance created at definition time would be shared by every Inspector.
        if args is None:
            args = InspectorArgs()
        self.dataset = dataset
        self.args = args
        args.check_values()

        # Set defaults for variable types and semantics in case they are not defined
        n_vars = len(self.args.var_names)
        if self.args.var_types is None:
            default_type = "nominal"
            self.args.var_types = [default_type] * n_vars
            print(f"INFO: No values have been set for var_types. Defaults to {default_type}.")
        if self.args.var_semantics is None:
            default_semantics = "general"
            self.args.var_semantics = [default_semantics] * n_vars
            print(f"INFO: No values have been set for var_semantics. Defaults to {default_semantics}.")
        if self.args.var_bins is None:
            default_bin = 0
            self.args.var_bins = [default_bin] * n_vars

        # Dictionary for the metadata to be printed in the json output
        metadata_dict = self.args.to_dict()
        print("INFO: The metadata we will be using for the current analysis are:")
        print(metadata_dict)
        # Provisional value; replaced by a name below for Dataset/DataFrame inputs.
        metadata_dict["dataset"] = self.dataset
        self.metadata_dict = metadata_dict

        # Check if variable definitions match in length
        per_variable_lists = [args.var_types, args.var_semantics, args.var_bins]
        if any(len(args.var_names) != len(values) for values in per_variable_lists):
            sys.exit(f"ERROR! All variables in {args.var_names} should have an associated "
                     f"variable type, semantics, and bins. We instead got var_types: {args.var_types}, "
                     f"var_semantics: {args.var_semantics}, and var_bins: {args.var_bins}. Please provide "
                     f"an ordered list of types, semantics, and bins that match variable names "
                     f"and which have matching length for correct variable assignment.")

        # Check if column strings are names or indices (for both texts and labels)
        text_names_type = utils.check_column_type(args.text_names)
        label_names_type = utils.check_column_type(args.var_names)

        # Since the input file/dataset is the same, we require texts and labels columns to be of the same type
        if text_names_type != label_names_type:
            sys.exit(f"ERROR! text_cols are {text_names_type} while label_cols are {label_names_type}. "
                     "Please provide all column identifiers as names (as in the header line) or indices.")
        self.cols_type = text_names_type
        print(f"INFO: all column identifiers are treated as column {self.cols_type}.")

        if isinstance(self.dataset, Dataset):
            self.dataframe = pd.DataFrame(self.dataset)
            self.metadata_dict["dataset"] = self.dataset.info.dataset_name
        elif isinstance(self.dataset, pd.DataFrame):
            dataset_name = getattr(self.dataset, "name", "Custom_User_DataFrame")
            # Attribute access on a DataFrame also resolves column labels, so a
            # column called "name" would otherwise end up in the metadata.
            if isinstance(dataset_name, (pd.Series, pd.DataFrame)):
                dataset_name = "Custom_User_DataFrame"
            self.metadata_dict["dataset"] = dataset_name
            self.dataframe = self.dataset
        elif isinstance(self.dataset, str):
            self.dataframe = utils.convert_file_to_dataframe(self.dataset, cols_type=self.cols_type)
        else:
            sys.exit(f"The specified dataset is not one of the accepted ones (string, a pandas DataFrame or a Huggingface Dataset), but a type {type(self.dataset)} instead.")

        # Create a dictionary containing the specified column strings (values) for texts and labels (keys)
        self.col_names_dict = {
            utils.TEXT_COLS_KEY: args.text_names,
            utils.LABEL_COLS_KEY: args.var_names
        }

        # Instantiate the tokenizer
        self.tokenizer = Tokenizer(self.args)

        self.check_columns()
        self.check_nan_values()

        # Check if we need to bin or discretize any values
        self.discretize = any(n_bins != 0 for n_bins in self.args.var_bins)

    def check_columns(self):
        """A function to check that the specified text and variable columns are actually in
        the provided dataset. Terminates the program through `sys.exit` on the first
        missing column."""
        self.dataframe_cols = list(self.dataframe.columns)
        for col in self.args.text_names + self.args.var_names:
            if col not in self.dataframe_cols:
                sys.exit(f"ERROR: the '{col}' column is not present in the dataframe.")

    def check_nan_values(self):
        """Checks if the specified variable columns contain null values. If any are found,
        the program is terminated through `sys.exit`, unless `ignore_null_var` is set, in
        which case only an informative message is printed."""
        for var in self.args.var_names:
            nulls = self.dataframe[var].isnull()
            if not nulls.values.any():
                continue
            if self.args.ignore_null_var:
                print(f"INFO: One or more null values were found for the '{var}' variable. The indices (lines) of null values are {list(nulls[nulls].index)}. Since 'ignore_null_var' was set to True, Nan values will be treated as any other variable value. This might lead to unexpected results.")
            else:
                sys.exit(f"ERROR: One or more null values were found for the '{var}' variable. The indices (lines) of null values are {list(nulls[nulls].index)}. If you wish to ignore null values and proceed, please set 'ignore_null_var' to True when defining the InspectorArgs.")

    def handle_bins_and_granularity(self):
        """For each variable that requires binning, checks that it can be carried out and calls
        the dedicated function.

        Binning is refused (through `sys.exit`) for nominal variables, for ordinal
        variables whose semantics is not temporal, and for non-integer bin values.
        Temporal variables are converted with `pd.to_datetime` before binning. The
        binned column replaces the original one in `self.dataframe`.
        """
        per_variable_settings = zip(
            self.args.var_names,
            self.args.var_bins,
            self.args.var_types,
            self.args.var_semantics,
        )
        for curr_var_name, curr_bins, curr_type, curr_sem in per_variable_settings:
            if curr_bins == 0:
                continue
            if curr_type == "nominal":
                sys.exit(f"ERROR: var_bins was defined for variable {curr_var_name}, whose type is 'nominal'. However, nominal values cannot be divided into bins. If the {curr_var_name} variable is numeric, please specify another var_type for it. If it is an actual nominal variable, its var_bins value should be 0.")
            if curr_type == "ordinal" and curr_sem != "temporal":
                sys.exit(f"ERROR: var_bins was defined for variable {curr_var_name}, whose type is 'ordinal' but its semantics is not 'temporal'. However, ordinal values cannot be divided into bins if not of temporal semantics. If the {curr_var_name} variable is numeric, please specify another var_type for it. If it is an actual ordinal variable but not temporal, its var_bins value should be 0.")
            # bool is a subclass of int but is not a meaningful number of bins.
            if isinstance(curr_bins, bool) or not isinstance(curr_bins, int):
                sys.exit(f"ERROR: var_bins was defined, but not correctly. We expected a list of integer values for each variable (with 0 for variables where no binning is desired), but instead for the variable {curr_var_name} the input was of type {type(curr_bins).__name__}.")

            curr_var_column = self.dataframe[curr_var_name]
            if curr_sem == "temporal":
                curr_var_column = pd.to_datetime(curr_var_column)
            print(f"INFO: For the variable {curr_var_name}, bins were defined. It will therefore be split into {curr_bins} equal bins.")
            self.dataframe[curr_var_name] = preprocess_utils.discretize_bins_col(
                curr_var_column, curr_bins
            )

    def preprocess(self):
        """Performs all of the preprocessing operations of Variationist, such as grouping
        together variables and dividing variables into bins.

        Returns
        -------
        tuple
            The label values dictionary and the subsets of interest, as produced by the
            `preprocess_utils` helpers.
        """
        # Check if any discretization or binning should be carried out and do it
        if self.discretize:
            self.handle_bins_and_granularity()

        label_values_dict = preprocess_utils.get_label_values(self.dataframe, self.col_names_dict)

        single_text_and_variable = (
            len(self.args.var_names) == 1 and len(self.args.text_names) == 1
        )
        if single_text_and_variable:
            subsets_of_interest = preprocess_utils.get_subset_dict(
                self.dataframe, self.tokenizer.tokenized_col_dict, label_values_dict)
        else:
            # with more than one variable or more than one text column, we are
            # interested in the intersections between them
            subsets_of_interest = preprocess_utils.get_subset_intersections(
                self.dataframe, self.tokenizer.tokenized_col_dict, label_values_dict)
            label_values_dict = preprocess_utils.update_label_values_dict_with_inters(
                label_values_dict, self.args.text_names)

        return label_values_dict, subsets_of_interest

    def compute(self):
        """Main function carrying out the entire analysis pipeline. It creates a results dict
        with the calculated metrics.

        Returns
        -------
        tuple
            The subsets of interest and the results dictionary (keyed by metric name).
            The results dictionary is also stored in `self.results_dict`.
        """
        label_values_dict, subsets_of_interest = self.preprocess()
        results_dict = dict()

        for metric in self.args.metrics:
            current_metric = metrics.Metric(metric, self.args)
            metric_name = metric if isinstance(metric, str) else metric.__name__
            print(f"INFO: Currently calculating metric: '{metric_name}'")

            metric_result = current_metric.calculate_metric(
                label_values_dict, subsets_of_interest)
            # NOTE(review): the default metric set by InspectorArgs.check_values is
            # "basic-stats", while this branch tests for "stats" -- confirm which
            # name the metrics module actually uses.
            if metric_name == "stats":
                results_dict[metric_name] = metric_result
            else:
                # Results are nested under the first key of the label values dict.
                results_dict[metric_name] = {
                    list(label_values_dict.keys())[0]: metric_result
                }

        self.results_dict = results_dict
        return subsets_of_interest, results_dict

    def create_output_dict(self):
        """Function to create the output dictionary, containing both metadata and calculated
        metrics. It is stored in `self.output_dict` and requires `compute` to have been
        run first."""
        self.output_dict = {
            "metadata": self.metadata_dict,
            "metrics": self.results_dict,
        }

    def inspect(self):
        """Wrapper function for tokenizing, carrying out computation, and building the output
        dictionary, which it returns."""
        self.dataframe = self.tokenizer.tokenize(self.dataframe)
        self.compute()
        self.create_output_dict()
        return self.output_dict

    def save_output_to_json(self,
                            output_path = "output.json"
                            ):
        """Saves the output dictionary to a json file, which can then be imported with the
        Visualizer module.

        Parameters
        ----------
        output_path: str
            Where to write the json file. Defaults to "output.json".
        """
        # The context manager closes the file even if serialization fails.
        with open(output_path, "w", encoding="utf-8") as output_file:
            json.dump(self.output_dict, output_file, indent=4)