Source code for variationist.data.tokenization

"""
The Tokenizer class, to handle all the tokenization-related operations of Variationist.
"""
import pandas as pd
import sys

from variationist.data import preprocess_utils, tokenization_utils
from variationist import utils


class Tokenizer:
    """A class that handles all the tokenization-related operations of Variationist.
    
    Parameters
    ----------
    inspector_args: InspectorArgs
        The arguments that were passed to the Inspector.
    
    Attributes
    ----------
    args: InspectorArgs
        The arguments object received at construction time.
    column_names_dict: dict
        Maps ``utils.TEXT_COLS_KEY`` to ``args.text_names`` and
        ``utils.LABEL_COLS_KEY`` to ``args.var_names``.
    tok_function: callable
        The tokenization function selected from ``args.tokenizer``. It is
        called as ``tok_function(text_column, args)``.
    tokenized_col_dict: dict
        Maps each original text column name to the name of its tokenized
        counterpart. Empty until :meth:`tokenize` has been called.
    """
    
    def __init__(self, 
                 inspector_args) -> None:
        self.args = inspector_args
        
        self.column_names_dict = {
            utils.TEXT_COLS_KEY: self.args.text_names,
            utils.LABEL_COLS_KEY: self.args.var_names
        }
        self.tok_function = self._select_tok_function()
        # Defined up front so that reading it before `tokenize()` has run
        # yields an empty mapping instead of raising an AttributeError.
        self.tokenized_col_dict = {}
    
    
    def _select_tok_function(self):
        """Return the tokenization function that matches ``self.args.tokenizer``.
        
        Returns
        -------
        tok_function: callable
            ``args.tokenizer`` itself if it is callable, the whitespace
            tokenization function if it is the string "whitespace" (matched
            case-insensitively), or the HuggingFace tokenization function if
            it is a string starting with "hf::".
            
        Raises
        ------
        SystemExit
            If ``args.tokenizer`` matches none of the options above.
        """
        tokenizer = self.args.tokenizer
        if callable(tokenizer):
            return tokenizer
        # Only strings support the checks below. Any other value (e.g. None)
        # falls through to the explanatory exit message instead of failing
        # with an AttributeError on `.lower()`.
        if isinstance(tokenizer, str):
            if tokenizer.lower() == "whitespace":
                return tokenization_utils.whitespace_tokenization
            if tokenizer.startswith("hf::"):
                return tokenization_utils.huggingface_tokenization
        sys.exit(f"The selected tokenizer ({tokenizer}) does not match any of the available options. If you intend to use a pretrained tokenizer from HuggingFace, please use the format 'hf::TOKENIZER_NAME'. Other available options are 'whitespace', and a callable function.")
    
    
    def tokenize_column(self, 
                        text_column: pd.Series):
        """A function that tokenizes a text column using the selected tokenization function. It will also create n-grams and co-occurrences if requested by the user. It will then return the same text column, but tokenized/grouped according to the desired result.
        
        The steps are applied in this order: tokenization, stopword removal,
        n-gram creation (when ``args.n_tokens > 1``), and co-occurrence
        creation (when ``args.n_cooc > 1``, and only if no n-grams were
        requested).
        
        Parameters
        ----------
        text_column: pandas.Series
            The series (text column) that should be tokenized. NOTE(review):
            :meth:`tokenize` actually passes a single-column DataFrame
            (``dataframe[[col]]``); presumably the tokenization functions
            accept that shape. Confirm before relying on the annotation.
            
        Returns
        -------
        text_column: pandas.Series
            The same series as input, but tokenized/regrouped as requested.
             
        """
        tokenized_text_column = self.tok_function(text_column, self.args)

        stopwords_requested = bool(self.args.stopwords)
        has_custom_stopwords = self.args.custom_stopwords is not None
        has_language = self.args.language is not None

        # Custom stopwords are always removed when provided. The language is
        # only taken as a reason to remove stopwords when removal was
        # explicitly requested.
        if has_custom_stopwords or (stopwords_requested and has_language):
            tokenized_text_column = preprocess_utils.remove_stopwords(
                tokenized_text_column, self.args.language, self.args.custom_stopwords)
        elif stopwords_requested:
            print("WARNING: Stopword removal has been selected, but the \"language\"",
                "parameter has not been defined. Skipping stopword removal.")

        if self.args.n_tokens > 1:
            print("INFO: Creating n-grams...")
            tokenized_text_column = preprocess_utils.create_tokenized_ngrams_column(
                tokenized_text_column, self.args.n_tokens)
        
        # Co-occurrences are skipped whenever n-grams were requested.
        if self.args.n_cooc > 1 and self.args.n_tokens <= 1:
            print("INFO: Creating co-occurrences...")
            tokenized_text_column = preprocess_utils.create_tokenized_cooccurrences_column(
                tokenized_text_column, self.args.n_cooc, self.args.cooc_window_size, self.args.unique_cooc)
        return tokenized_text_column
    

    def tokenize(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """A wrapper function to tokenize each text column and add it to the original input dataframe as 'tok_ORIGINAL_TEXT_COL_NAME'. Returns the dataframe with the added tokenized columns.
        
        The input dataframe is modified in place (the new columns are assigned
        directly on it), and the mapping from original to tokenized column
        names is stored in ``self.tokenized_col_dict``.
        
        Parameters
        ----------
        dataframe: pandas.DataFrame
            The dataframe that contains the data for the analysis
            
        Returns
        -------
        dataframe: pandas.DataFrame
            The same dataframe as input, but with added columns containing the tokenized texts.
        """
        tokenized_col_dict = {}
        for text_col in self.column_names_dict[utils.TEXT_COLS_KEY]:
            print(f"INFO: Tokenizing the {text_col} column...")
            tokenized_col_dict[text_col] = f"tok_{text_col}"
            # The double brackets select a single-column DataFrame rather than
            # a Series. Kept as is: presumably the tokenization functions
            # expect this shape (verify against tokenization_utils).
            dataframe[tokenized_col_dict[text_col]] = self.tokenize_column(
                dataframe[[str(text_col)]])
        self.tokenized_col_dict = tokenized_col_dict
        return dataframe