_utils.py
import random
import time
from typing import Callable, Union

def truncate_document(
    topic_model,
    doc_length: Union[int, None],
    tokenizer: Union[str, Callable],
    document: str,
) -> str:
    """Truncate a document to a certain length.

    If you want to add a custom tokenizer, then it will need to have `decode` and
    `encode` methods. An example would be the following custom tokenizer:

    ```python
    class Tokenizer:
        'A custom tokenizer that splits on commas'

        def encode(self, doc):
            return doc.split(",")

        def decode(self, doc_chunks):
            return ",".join(doc_chunks)
    ```

    You can use this tokenizer by passing it to the `tokenizer` parameter.

    Arguments:
        topic_model: A BERTopic model
        doc_length: The maximum length of each document. If a document is longer,
                    it will be truncated. If None, the entire document is passed.
        tokenizer: The tokenizer used to split the document into segments, which are
                   counted to determine the document's length.

            * If tokenizer is 'char', the document is split into individual
              characters, which are counted to adhere to `doc_length`.
            * If tokenizer is 'whitespace', the document is split into words
              separated by whitespace. These words are counted and truncated
              depending on `doc_length`.
            * If tokenizer is 'vectorizer', the internal CountVectorizer is used
              to tokenize the document. These tokens are counted and truncated
              depending on `doc_length`. They are decoded with whitespace.
            * If tokenizer is a callable, that callable is used to tokenize the
              document. These tokens are counted and truncated depending on
              `doc_length`.
        document: A single document

    Returns:
        truncated_document: A truncated document
    """
    if doc_length is not None:
        if tokenizer == "char":
            truncated_document = document[:doc_length]
        elif tokenizer == "whitespace":
            truncated_document = " ".join(document.split()[:doc_length])
        elif tokenizer == "vectorizer":
            tokenizer = topic_model.vectorizer_model.build_tokenizer()
            truncated_document = " ".join(tokenizer(document)[:doc_length])
        elif hasattr(tokenizer, "encode") and hasattr(tokenizer, "decode"):
            encoded_document = tokenizer.encode(document)
            truncated_document = tokenizer.decode(encoded_document[:doc_length])
        return truncated_document
    return document
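
# Example usage (illustrative sketch): the "char" and "whitespace" tokenizers never
# touch `topic_model`, so `None` is passed in its place here for brevity.
#
#     truncate_document(None, doc_length=5, tokenizer="char", document="hello world")
#     # -> "hello"
#     truncate_document(None, doc_length=3, tokenizer="whitespace", document="one two three four")
#     # -> "one two three"
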
def validate_truncate_document_parameters(tokenizer, doc_length) -> None:
    """Validate the parameters that are used in the function `truncate_document`."""
    if tokenizer is None and doc_length is not None:
        raise ValueError(
            "Please select from one of the valid options for the `tokenizer` parameter: \n"
            "{'char', 'whitespace', 'vectorizer'} \n"
            "If `tokenizer` is of type callable, ensure it has methods to encode and decode a document. \n"
        )
    elif tokenizer is not None and doc_length is None:
        raise ValueError("If `tokenizer` is provided, a `doc_length` of type int must be provided as well.")
def retry_with_exponential_backoff(
    func,
    initial_delay: float = 1,
    exponential_base: float = 2,
    jitter: bool = True,
    max_retries: int = 10,
    errors: tuple = None,
):
    """Retry a function with exponential backoff.

    `errors` should be a tuple of the exception types that warrant a retry;
    any other exception is re-raised immediately.
    """

    def wrapper(*args, **kwargs):
        # Initialize variables
        num_retries = 0
        delay = initial_delay

        # Loop until a successful response, max_retries is hit, or an exception is raised
        while True:
            try:
                return func(*args, **kwargs)

            # Retry on the specified errors
            except errors:
                # Increment retries
                num_retries += 1

                # Check if max retries has been reached
                if num_retries > max_retries:
                    raise Exception(f"Maximum number of retries ({max_retries}) exceeded.")

                # Increase the delay: multiply by the exponential base and,
                # if jitter is enabled, by a random factor in [1, 2)
                delay *= exponential_base * (1 + jitter * random.random())

                # Sleep for the delay
                time.sleep(delay)

            # Raise exceptions for any errors not specified
            except Exception as e:
                raise e

    return wrapper
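
# Minimal usage sketch; `flaky` and its failure counter are hypothetical
# stand-ins for a rate-limited API call.
if __name__ == "__main__":
    attempts = {"count": 0}

    def flaky():
        attempts["count"] += 1
        if attempts["count"] < 3:
            raise ConnectionError("transient failure")
        return "ok"

    # Retry only on ConnectionError; a short initial delay keeps the demo fast.
    resilient = retry_with_exponential_backoff(flaky, initial_delay=0.01, errors=(ConnectionError,))
    print(resilient())  # succeeds on the third attempt and prints "ok"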