# test_utils.py 3.24 KB
import pytest
import logging
import numpy as np
from typing import List
from bertopic._utils import (
    check_documents_type,
    check_embeddings_shape,
    MyLogger,
    select_topic_representation,
    get_unique_distances,
)
from scipy.sparse import csr_matrix


def test_logger():
    """Check that ``MyLogger.configure`` sets the requested level on a real ``logging.Logger``.

    A fresh ``MyLogger`` is created for every level name so each configuration is
    checked on a newly constructed wrapper.
    """
    # (level name passed to configure, numeric level expected on the wrapped logger)
    expectations = (
        ("DEBUG", logging.DEBUG),
        ("WARNING", logging.WARNING),
    )
    for level_name, numeric_level in expectations:
        wrapper = MyLogger()
        wrapper.configure(level_name)
        assert isinstance(wrapper.logger, logging.Logger)
        assert wrapper.logger.level == numeric_level


@pytest.mark.parametrize(
    "docs",
    (
        "A document not in an iterable",  # a bare string
        [None],  # a list whose only element is None
        5,  # an integer
    ),
)
def test_check_documents_type(docs):
    """Expect ``check_documents_type`` to raise ``TypeError`` for each parametrized input."""
    with pytest.raises(TypeError):
        check_documents_type(docs)


def test_check_embeddings_shape():
    """Call ``check_embeddings_shape`` with two documents and a two-row embedding array.

    There is no explicit assertion: the test passes as long as the call does not raise.
    """
    documents = ["doc_one", "doc_two"]
    # One row per document, three values per row.
    vectors = np.array([[1, 2, 3], [2, 3, 4]])
    check_embeddings_shape(vectors, documents)


def test_make_unique_distances():
    """Check that ``get_unique_distances`` removes duplicates while keeping ascending order.

    Every input list below is already sorted in ascending order. For each one the
    result must have the same number of elements, contain no duplicates, and still
    be sorted in ascending order.
    """

    def check_dists(dists: List[float], noise_max: float):
        """Run ``get_unique_distances`` on ``dists`` and assert length, uniqueness and order."""
        unique_dists = get_unique_distances(np.array(dists, dtype=float), noise_max=noise_max)
        assert len(unique_dists) == len(dists), "The number of elements must be the same"
        assert len(dists) == len(np.unique(unique_dists)), "The distances must be unique"
        # Ordering is what the high-noise and all-equal cases are meant to exercise,
        # so it is asserted explicitly instead of only being mentioned in comments.
        assert np.all(np.diff(unique_dists) >= 0), "The distances must be sorted in ascending order"

    # duplicates at both ends of the list with a small `noise_max`
    check_dists([0, 0, 0.5, 0.75, 1, 1], noise_max=1e-7)

    # the distances must stay sorted in ascending order even when the noise is extremely high
    check_dists([0, 0, 0, 0.5, 0.75, 1, 1], noise_max=20)

    # the distances must stay sorted in ascending order when they are all the same
    check_dists([0, 0, 0, 0, 0, 0, 0], noise_max=1e-7)


def test_select_topic_representation():
    """Exercise ``select_topic_representation`` for dense and sparse c-TF-IDF inputs.

    The dense scenarios check which of the two inputs is returned and the boolean
    flag returned with it. The sparse scenarios check the effect of ``output_ndarray``.
    """
    dense_ctfidf = np.array([[1, 1, 1]])
    dense_topics = np.array([[2, 2, 2]])

    # Sparse copy of the c-TF-IDF row, built from (values, (rows, cols)) triplets.
    values = dense_ctfidf.reshape(-1).tolist()
    rows, cols = [0, 0, 0], [0, 1, 2]
    sparse_ctfidf = csr_matrix((values, (rows, cols)), shape=dense_ctfidf.shape)

    # Columns: c-TF-IDF input, embeddings input, use_ctfidf, expected output, expected flag
    scenarios = [
        (dense_ctfidf, dense_topics, False, dense_topics, False),  # use topic embeddings
        (dense_ctfidf, None, False, dense_ctfidf, True),  # fallback to c-TF-IDF
        (dense_ctfidf, dense_topics, True, dense_ctfidf, True),  # use c-TF-IDF
        (None, dense_topics, True, dense_topics, False),  # fallback to topic embeddings
    ]
    for ctfidf_in, embeddings_in, prefer_ctfidf, expected, expected_flag in scenarios:
        selected, used_ctfidf = select_topic_representation(ctfidf_in, embeddings_in, use_ctfidf=prefer_ctfidf)
        np.testing.assert_array_equal(expected, selected)
        assert bool(used_ctfidf) is expected_flag

    # A `scipy.sparse.csr_matrix` can be passed as the c-TF-IDF embeddings
    densified = select_topic_representation(sparse_ctfidf, None, use_ctfidf=True, output_ndarray=True)
    np.testing.assert_array_equal(dense_ctfidf, densified[0])

    # The `csr_matrix` must not be cast to `np.ndarray` when `output_ndarray` is False
    kept_sparse = select_topic_representation(sparse_ctfidf, None, output_ndarray=False)[0]

    assert isinstance(kept_sparse, csr_matrix)