Source code for variationist.data.tokenization

"""
The Tokenizer class, to handle all the tokenization-related operations of Variationist.
"""
import pandas as pd
import sys

from variationist.data import preprocess_utils, tokenization_utils
from variationist import utils


class Tokenizer:
    """A class that handles all the tokenization-related operations of Variationist.

    Parameters
    ----------
    inspector_args: InspectorArgs
        The arguments that were passed to the Inspector. The ``tokenizer``
        attribute may be a callable, the string ``"whitespace"``
        (case-insensitive), or a string starting with ``"hf::"``.
    """

    def __init__(self, inspector_args) -> None:
        """Store the arguments and select the tokenization function.

        Raises
        ------
        SystemExit
            If ``inspector_args.tokenizer`` matches none of the supported
            options.
        """
        self.args = inspector_args
        self.column_names_dict = {
            utils.TEXT_COLS_KEY: self.args.text_names,
            utils.LABEL_COLS_KEY: self.args.var_names,
        }

        tokenizer = self.args.tokenizer
        # Only strings can be matched against the named options; guarding with
        # isinstance lets any other non-callable value (e.g. None) reach the
        # explanatory exit message instead of failing on `.lower()`.
        is_named = isinstance(tokenizer, str)

        if callable(tokenizer):
            self.tok_function = tokenizer
        elif is_named and tokenizer.lower() == "whitespace":
            self.tok_function = tokenization_utils.whitespace_tokenization
        elif is_named and tokenizer.startswith("hf::"):
            self.tok_function = tokenization_utils.huggingface_tokenization
        else:
            sys.exit(f"The selected tokenizer ({self.args.tokenizer}) does not match any of the available options. If you intend to use a pretrained tokenizer from HuggingFace, please use the format 'hf::TOKENIZER_NAME'. Other available options are 'whitespace', and a callable function.")

    def tokenize_column(self, text_column: pd.Series) -> pd.Series:
        """A function that tokenizes a text column using the selected tokenization
        function. It will also create n-grams and co-occurrences if requested by
        the user. It will then return the same text column, but
        tokenized/grouped according to the desired result.

        Parameters
        ----------
        text_column: pandas.Series
            The series (text column) that should be tokenized.

        Returns
        -------
        text_column: pandas.Series
            The same series as input, but tokenized/regrouped as requested.
        """
        tokenized_text_column = self.tok_function(text_column, self.args)

        # Equality (not truthiness) is kept on purpose so that only values
        # equal to True enable language-based stopword removal.
        stopwords_requested = self.args.stopwords == True  # noqa: E712
        has_language = self.args.language is not None
        has_custom_stopwords = self.args.custom_stopwords is not None

        # Custom stopwords are always applied when provided; language-based
        # removal additionally requires the `stopwords` switch.
        if has_custom_stopwords or (stopwords_requested and has_language):
            tokenized_text_column = preprocess_utils.remove_stopwords(
                tokenized_text_column,
                self.args.language,
                self.args.custom_stopwords)
        elif stopwords_requested:
            print("WARNING: Stopword removal has been selected, but the \"language\"",
                  "parameter has not been defined. Skipping stopword removal.")

        if self.args.n_tokens > 1:
            print("INFO: Creating n-grams...")
            tokenized_text_column = preprocess_utils.create_tokenized_ngrams_column(
                tokenized_text_column, self.args.n_tokens)

        # Co-occurrences are only built when n-grams were not requested.
        if self.args.n_cooc > 1 and self.args.n_tokens <= 1:
            print("INFO: Creating co-occurrences...")
            tokenized_text_column = preprocess_utils.create_tokenized_cooccurrences_column(
                tokenized_text_column,
                self.args.n_cooc,
                self.args.cooc_window_size,
                self.args.unique_cooc)

        return tokenized_text_column

    def tokenize(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """A wrapper function to tokenize each text column and add it to the
        original input dataframe as 'tok_ORIGINAL_TEXT_COL_NAME'. Returns the
        dataframe with the added tokenized columns.

        The input dataframe is modified in place, and the mapping from each
        original column name to its tokenized column name is stored in
        ``self.tokenized_col_dict``.

        Parameters
        ----------
        dataframe: pandas.DataFrame
            The dataframe that contains the data for the analysis

        Returns
        -------
        dataframe: pandas.DataFrame
            The same dataframe as input, but with added columns containing the
            tokenized texts.
        """
        tokenized_col_dict = {}
        for text_col in self.column_names_dict[utils.TEXT_COLS_KEY]:
            print(f"INFO: Tokenizing the {text_col} column...")
            tokenized_col_dict[text_col] = f"tok_{text_col}"
            # NOTE(review): the double brackets pass a one-column DataFrame
            # rather than a Series as annotated on tokenize_column; confirm
            # that the tokenization functions expect this shape.
            dataframe[tokenized_col_dict[text_col]] = self.tokenize_column(
                dataframe[[str(text_col)]])

        self.tokenized_col_dict = tokenized_col_dict
        return dataframe