TopicRepresentation.py
33.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
import numpy as np
import umap
import sys
import os
import inspect
from tqdm import tqdm
import umap
import json
# make sure the import works even if the package has not been installed and just the files are used
from topicgpt.Clustering import Clustering_and_DimRed
from topicgpt.ExtractTopWords import ExtractTopWords
from topicgpt.TopwordEnhancement import TopwordEnhancement
class Topic:
    """
    Class to represent a topic and all its attributes.
    """

    def __init__(self,
                 topic_idx: str,
                 documents: list[str],
                 words: dict[str, int],
                 centroid_hd: np.ndarray = None,
                 centroid_ld: np.ndarray = None,
                 document_embeddings_hd: np.ndarray = None,
                 document_embeddings_ld: np.ndarray = None,
                 document_embedding_similarity: np.ndarray = None,
                 umap_mapper: "umap.UMAP" = None,
                 top_words: dict[str, list[str]] = None,
                 top_word_scores: dict[str, list[float]] = None
                 ) -> None:
        """
        Represents a topic and all its attributes.

        Args:
            topic_idx (str): Index or name of the topic.
            documents (list[str]): List of documents in the topic.
            words (dict[str, int]): Dictionary of words and their counts in the topic.
            centroid_hd (np.ndarray, optional): Centroid of the topic in high-dimensional space.
            centroid_ld (np.ndarray, optional): Centroid of the topic in low-dimensional space.
            document_embeddings_hd (np.ndarray, optional): Embeddings of documents in high-dimensional space that belong to this topic.
            document_embeddings_ld (np.ndarray, optional): Embeddings of documents in low-dimensional space that belong to this topic.
            document_embedding_similarity (np.ndarray, optional): Similarity array of document embeddings to the centroid in low-dimensional space.
            umap_mapper (umap.UMAP, optional): UMAP mapper object to map from high-dimensional space to low-dimensional space.
            top_words (dict[str, list[str]], optional): Dictionary of top words in the topic according to different metrics.
            top_word_scores (dict[str, list[float]], optional): Dictionary of how representative the top words are according to different metrics.

        Raises:
            AssertionError: If a provided per-document array does not have one entry per document,
                or if ``documents`` or ``words`` is empty.
        """
        # The per-document arrays are optional (default None); only the ones
        # actually provided are checked against the number of documents.
        per_document_arrays = (document_embeddings_hd, document_embeddings_ld, document_embedding_similarity)
        assert all(arr is None or len(arr) == len(documents) for arr in per_document_arrays), "documents, document_embeddings_hd, document_embeddings_ld and document_embedding_similarity must have the same length"
        assert len(documents) > 0, "documents must not be empty"
        assert len(words) > 0, "words must not be empty"

        self.topic_idx = topic_idx
        self.documents = documents
        self.words = words
        self.centroid_hd = centroid_hd
        self.centroid_ld = centroid_ld
        self.document_embeddings_hd = document_embeddings_hd
        self.document_embeddings_ld = document_embeddings_ld
        self.document_embedding_similarity = document_embedding_similarity
        self.umap_mapper = umap_mapper
        self.top_words = top_words
        self.top_word_scores = top_word_scores
        self.topic_name = None  # filled in later via set_topic_name
        self.topic_description = None  # filled in later via set_topic_description

    def __str__(self) -> str:
        # A topic with neither an index nor a name is identified by its hash.
        if self.topic_idx is None and self.topic_name is None:
            return f"Topic {hash(self)}\n"
        if self.topic_name is None:
            return f"Topic: {self.topic_idx}\n"
        return f"Topic {self.topic_idx}: {self.topic_name}\n"

    def __repr__(self) -> str:
        return self.__str__()

    def to_json(self) -> str:
        """
        Return a JSON representation of the topic (index, name and description).
        """
        repr_dict = {
            "topic_idx": self.topic_idx,
            "topic_name": self.topic_name,
            "topic_description": self.topic_description
        }
        return json.dumps(repr_dict, indent = 4)

    def to_dict(self) -> dict:
        """
        Return a dict representation of the topic (index, name and description).

        The index is converted to ``int`` when possible; a missing or
        non-numeric index is returned unchanged.
        """
        try:
            topic_idx = int(self.topic_idx)
        except (TypeError, ValueError):
            topic_idx = self.topic_idx
        return {
            "topic_idx": topic_idx,
            "topic_name": self.topic_name,
            "topic_description": self.topic_description
        }

    def set_topic_name(self, name: str):
        """
        Add a name to the topic.

        Args:
            name (str): Name of the topic.
        """
        self.topic_name = name

    def set_topic_description(self, text: str):
        """
        Add a text description to the topic.

        Args:
            text (str): Text description of the topic.
        """
        self.topic_description = text
def topic_to_json(topic: "Topic") -> str:
    """
    Return a JSON representation of the topic.

    Args:
        topic (Topic): The topic object to convert to JSON.

    Returns:
        str: A JSON string (4-space indent) with the keys "topic_idx",
            "topic_name" and "topic_description".
    """
    repr_dict = {
        "topic_idx": topic.topic_idx,
        "topic_name": topic.topic_name,
        # A topic that has not been described yet may not carry the attribute
        # at all; serialise it as null instead of raising AttributeError.
        "topic_description": getattr(topic, "topic_description", None)
    }
    return json.dumps(repr_dict, indent = 4)
def topic_lis_to_json(topics: list["Topic"]) -> str:
    """
    Return a JSON representation of a list of topics.

    Args:
        topics (list[Topic]): The list of topic objects to convert to JSON.

    Returns:
        str: A JSON string (4-space indent) mapping each topic's ``topic_idx``
            to its "topic_name" and "topic_description". Topics sharing the
            same ``topic_idx`` overwrite each other (last one wins).
    """
    repr_dict = {
        topic.topic_idx: {
            "topic_name": topic.topic_name,
            # Topics that were never described may lack the attribute;
            # serialise those as null instead of raising AttributeError.
            "topic_description": getattr(topic, "topic_description", None)
        }
        for topic in topics
    }
    return json.dumps(repr_dict, indent = 4)
def extract_topics(corpus: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], compute_vocab_hyperparams: dict = {}) -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided clusterer object on the document embeddings.

    This is a plain module-level function: a module-level ``@staticmethod``
    object is only callable from Python 3.10 on, so the decorator is omitted.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings (np.ndarray): Embeddings of the documents.
        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        compute_vocab_hyperparams (dict, optional): Keyword arguments forwarded to the vocabulary computation.

    Returns:
        list[Topic]: List of Topic objects representing the extracted topics. Within each topic the
            documents, their embeddings and their centroid similarities are sorted by decreasing
            similarity to the low-dimensional centroid.

    Raises:
        ValueError: If ``topword_extraction_methods`` is empty or contains an unknown method.
    """
    # NOTE: the list/dict defaults are only read, never mutated, so sharing them between calls is harmless.
    for elem in topword_extraction_methods:
        if elem not in ["tfidf", "cosine_similarity"]:
            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
    if not topword_extraction_methods:
        raise ValueError("topword_extraction_methods cannot be empty")
    use_tfidf = "tfidf" in topword_extraction_methods
    use_cosine = "cosine_similarity" in topword_extraction_methods

    # get dimensionality reduced embeddings, their labels and the umap mapper object
    dim_red_embeddings, labels, umap_mapper = clusterer.cluster_and_reduce(document_embeddings)

    # In case the cluster labels are not consecutive numbers, map them to consecutive ones (-1 stays -1)
    unique_labels = np.unique(labels)
    label_mapping = {label: i for i, label in enumerate(unique_labels[unique_labels != -1])}
    label_mapping[-1] = -1
    labels = np.array([label_mapping[label] for label in labels])

    extractor = ExtractTopWords()
    centroid_dict = extractor.extract_centroids(document_embeddings, labels)  # centroids of the clusters
    centroid_arr = np.array(list(centroid_dict.values()))
    if centroid_arr.ndim == 1:
        centroid_arr = centroid_arr.reshape(-1, 1)
    dim_red_centroids = umap_mapper.transform(centroid_arr)  # map the centroids to low dimensional space
    dim_red_centroid_dict = {label: centroid for label, centroid in zip(centroid_dict.keys(), dim_red_centroids)}

    vocab = extractor.compute_corpus_vocab(corpus, **compute_vocab_hyperparams)  # vocabulary of the corpus
    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = False)  # word-topic matrix of the corpus

    if use_tfidf:
        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)
    if use_cosine:
        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict= dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False)

    topics = []
    for label in np.unique(labels):
        if label < -0.5:  # dont include outliers
            continue
        in_topic = labels == label
        documents = [doc for doc, keep in zip(corpus, in_topic) if keep]
        embeddings_hd = document_embeddings[in_topic]
        embeddings_ld = dim_red_embeddings[in_topic]
        centroid_hd = centroid_dict[label]
        # Look the centroid up by label rather than by row position so the
        # result does not depend on the ordering of centroid_dict.
        centroid_ld = dim_red_centroid_dict[label]

        # cosine similarity of every document to the centroid in low-dimensional space
        centroid_similarity = np.dot(embeddings_ld, centroid_ld)/(np.linalg.norm(embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld))
        similarity_sorting = np.argsort(centroid_similarity)[::-1]
        documents = [documents[k] for k in similarity_sorting]
        embeddings_hd = embeddings_hd[similarity_sorting]
        embeddings_ld = embeddings_ld[similarity_sorting]
        # keep the similarities aligned with the re-ordered documents
        centroid_similarity = centroid_similarity[similarity_sorting]

        # Only touch the cosine results when they were actually computed.
        if use_cosine and isinstance(cosine_topwords[label], dict):
            cosine_topwords[label] = cosine_topwords[label][0]

        top_words = {
            "tfidf": tfidf_topwords[label] if use_tfidf else None,
            "cosine_similarity": cosine_topwords[label] if use_cosine else None
        }
        top_word_scores = {
            "tfidf": tfidf_dict[label] if use_tfidf else None,
            "cosine_similarity": cosine_dict[label] if use_cosine else None
        }

        topics.append(Topic(topic_idx = f"{label}",
                            documents = documents,
                            words = vocab,
                            centroid_hd = centroid_hd,
                            centroid_ld = centroid_ld,
                            document_embeddings_hd = embeddings_hd,
                            document_embeddings_ld = embeddings_ld,
                            document_embedding_similarity = centroid_similarity,
                            umap_mapper = umap_mapper,
                            top_words = top_words,
                            top_word_scores = top_word_scores
                            ))
    return topics
def extract_topics_no_new_vocab_computation(corpus: list[str], vocab: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], consider_outliers: bool = False) -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided clusterer object on the document embeddings.
    This version does not compute the vocabulary of the corpus and instead uses the provided vocabulary.

    This is a plain module-level function: a module-level ``@staticmethod``
    object is only callable from Python 3.10 on, so the decorator is omitted.

    Args:
        corpus (list[str]): List of documents.
        vocab (list[str]): Vocabulary of the corpus.
        document_embeddings (np.ndarray): Embeddings of the documents.
        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        consider_outliers (bool, optional): Forwarded to the word-topic matrix computation (default is False).

    Returns:
        list[Topic]: List of Topic objects representing the extracted topics. Within each topic the
            documents, their embeddings and their centroid similarities are sorted by decreasing
            similarity to the low-dimensional centroid. Outliers (label -1) never become a topic.

    Raises:
        ValueError: If ``topword_extraction_methods`` is empty or contains an unknown method.
    """
    # NOTE: the list default is only read, never mutated, so sharing it between calls is harmless.
    for elem in topword_extraction_methods:
        if elem not in ["tfidf", "cosine_similarity"]:
            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
    if not topword_extraction_methods:
        raise ValueError("topword_extraction_methods cannot be empty")
    use_tfidf = "tfidf" in topword_extraction_methods
    use_cosine = "cosine_similarity" in topword_extraction_methods

    # get dimensionality reduced embeddings, their labels and the umap mapper object
    dim_red_embeddings, labels, umap_mapper = clusterer.cluster_and_reduce(document_embeddings)

    # In case the cluster labels are not consecutive numbers, map them to consecutive ones (-1 stays -1)
    unique_labels = np.unique(labels)
    label_mapping = {label: i for i, label in enumerate(unique_labels[unique_labels != -1])}
    label_mapping[-1] = -1
    labels = np.array([label_mapping[label] for label in labels])

    extractor = ExtractTopWords()
    centroid_dict = extractor.extract_centroids(document_embeddings, labels)  # centroids of the clusters
    centroid_arr = np.array(list(centroid_dict.values()))
    if centroid_arr.ndim == 1:
        centroid_arr = centroid_arr.reshape(-1, 1)
    dim_red_centroids = umap_mapper.transform(centroid_arr)  # map the centroids to low dimensional space
    dim_red_centroid_dict = {label: centroid for label, centroid in zip(centroid_dict.keys(), dim_red_centroids)}

    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = consider_outliers)  # word-topic matrix of the corpus

    if use_tfidf:
        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)
    if use_cosine:
        # NOTE(review): consider_outliers is hard-coded to True here while the word-topic matrix
        # above honours the caller's flag - confirm this asymmetry is intended.
        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict= dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = True)

    topics = []
    for label in np.unique(labels):
        if label < -0.5:  # dont include outliers
            continue
        in_topic = labels == label
        documents = [doc for doc, keep in zip(corpus, in_topic) if keep]
        embeddings_hd = document_embeddings[in_topic]
        embeddings_ld = dim_red_embeddings[in_topic]
        centroid_hd = centroid_dict[label]
        # Look the centroid up by label rather than by row position so the
        # result does not depend on the ordering of centroid_dict.
        centroid_ld = dim_red_centroid_dict[label]

        # cosine similarity of every document to the centroid in low-dimensional space
        centroid_similarity = np.dot(embeddings_ld, centroid_ld)/(np.linalg.norm(embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld))
        similarity_sorting = np.argsort(centroid_similarity)[::-1]
        documents = [documents[k] for k in similarity_sorting]
        embeddings_hd = embeddings_hd[similarity_sorting]
        embeddings_ld = embeddings_ld[similarity_sorting]
        # keep the similarities aligned with the re-ordered documents
        centroid_similarity = centroid_similarity[similarity_sorting]

        # Explicit guards instead of a blanket try/except: only unwrap the
        # cosine result when it was computed and really is a dict with entry 0.
        if use_cosine and isinstance(cosine_topwords[label], dict) and 0 in cosine_topwords[label]:
            cosine_topwords[label] = cosine_topwords[label][0]

        top_words = {
            "tfidf": tfidf_topwords[label] if use_tfidf else None,
            "cosine_similarity": cosine_topwords[label] if use_cosine else None
        }
        top_word_scores = {
            "tfidf": tfidf_dict[label] if use_tfidf else None,
            "cosine_similarity": cosine_dict[label] if use_cosine else None
        }

        topics.append(Topic(topic_idx = f"{label}",
                            documents = documents,
                            words = vocab,
                            centroid_hd = centroid_hd,
                            centroid_ld = centroid_ld,
                            document_embeddings_hd = embeddings_hd,
                            document_embeddings_ld = embeddings_ld,
                            document_embedding_similarity = centroid_similarity,
                            umap_mapper = umap_mapper,
                            top_words = top_words,
                            top_word_scores = top_word_scores
                            ))
    return topics
def extract_and_describe_topics(corpus: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, enhancer: TopwordEnhancement, n_topwords: int = 2000, n_topwords_description: int = 500, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], compute_vocab_hyperparams: dict = {}, topword_description_method: str = "cosine_similarity") -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided clusterer object on the document embeddings and describes/names them using the given enhancer object.

    Thin wrapper: runs ``extract_topics`` and then ``describe_and_name_topics``,
    printing a progress message before each step. Defined as a plain
    module-level function because a module-level ``@staticmethod`` object is
    only callable from Python 3.10 on.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings (np.ndarray): Embeddings of the documents.
        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        enhancer (TopwordEnhancement): Enhancer object for enhancing top-words and generating descriptions/names for topics.
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        n_topwords_description (int, optional): Number of top-words to use from the extracted topics for description and naming (default is 500).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        compute_vocab_hyperparams (dict, optional): Keyword arguments forwarded to the vocabulary computation.
        topword_description_method (str, optional): Method to use for top-word extraction for description/naming.
            Can be "tfidf" or "cosine_similarity" (default is "cosine_similarity").

    Returns:
        list[Topic]: List of Topic objects representing the extracted and described topics.
    """
    # NOTE: the list/dict defaults are only passed through, never mutated here.
    print("Extracting topics...")
    topics = extract_topics(corpus, document_embeddings, clusterer, vocab_embeddings, n_topwords, topword_extraction_methods, compute_vocab_hyperparams)
    print("Describing topics...")
    return describe_and_name_topics(topics, enhancer, topword_description_method, n_topwords_description)
def extract_topics_labels_vocab(corpus: list[str], document_embeddings_hd: np.ndarray, document_embeddings_ld: np.ndarray, labels: np.ndarray, umap_mapper: umap.UMAP, vocab_embeddings: np.ndarray, vocab: list[str] = None, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"]) -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided labels that indicate the topics.
    The vocabulary is computed from the corpus only when none is provided.

    This is a plain module-level function: a module-level ``@staticmethod``
    object is only callable from Python 3.10 on, so the decorator is omitted.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings_hd (np.ndarray): Embeddings of the documents in high-dimensional space.
        document_embeddings_ld (np.ndarray): Embeddings of the documents in low-dimensional space.
        labels (np.ndarray): Labels indicating the topics. Negative labels are treated as outliers
            and do not produce a topic.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        vocab (list[str], optional): Vocabulary of the corpus (default is None, i.e. compute it from the corpus).
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).

    Returns:
        list[Topic]: List of Topic objects representing the extracted topics. Within each topic the
            documents, their embeddings and their centroid similarities are sorted by decreasing
            similarity to the low-dimensional centroid.

    Raises:
        ValueError: If ``topword_extraction_methods`` is empty or contains an unknown method.
    """
    # NOTE: the list default is only read, never mutated, so sharing it between calls is harmless.
    for elem in topword_extraction_methods:
        if elem not in ["tfidf", "cosine_similarity"]:
            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
    if not topword_extraction_methods:
        raise ValueError("topword_extraction_methods cannot be empty")
    use_tfidf = "tfidf" in topword_extraction_methods
    use_cosine = "cosine_similarity" in topword_extraction_methods

    # Boolean masks below need an array; a plain list of labels is accepted too.
    labels = np.asarray(labels)

    extractor = ExtractTopWords()
    if vocab is None:
        vocab = extractor.compute_corpus_vocab(corpus)  # compute the vocabulary of the corpus

    centroid_dict = extractor.extract_centroids(document_embeddings_hd, labels)  # centroids of the clusters
    centroid_arr = np.array(list(centroid_dict.values()))
    if centroid_arr.ndim == 1:
        centroid_arr = centroid_arr.reshape(-1, 1)
    dim_red_centroids = umap_mapper.transform(centroid_arr)  # map the centroids to low dimensional space
    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = False)  # word-topic matrix of the corpus
    dim_red_centroid_dict = {label: centroid for label, centroid in zip(centroid_dict.keys(), dim_red_centroids)}

    if use_tfidf:
        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)
    if use_cosine:
        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict= dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False)

    topics = []
    for label in np.unique(labels):
        if label < -0.5:  # dont include outliers
            continue
        in_topic = labels == label
        documents = [doc for doc, keep in zip(corpus, in_topic) if keep]
        embeddings_hd = document_embeddings_hd[in_topic]
        embeddings_ld = document_embeddings_ld[in_topic]
        centroid_hd = centroid_dict[label]
        # The labels come from the caller and need not be 0..k-1, so the
        # centroid is looked up by label instead of by row position.
        centroid_ld = dim_red_centroid_dict[label]

        # cosine similarity of every document to the centroid in low-dimensional space
        centroid_similarity = np.dot(embeddings_ld, centroid_ld)/(np.linalg.norm(embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld))
        similarity_sorting = np.argsort(centroid_similarity)[::-1]
        documents = [documents[k] for k in similarity_sorting]
        embeddings_hd = embeddings_hd[similarity_sorting]
        embeddings_ld = embeddings_ld[similarity_sorting]
        # keep the similarities aligned with the re-ordered documents
        centroid_similarity = centroid_similarity[similarity_sorting]

        # Only touch the cosine results when they were actually computed.
        if use_cosine and isinstance(cosine_topwords[label], dict):
            cosine_topwords[label] = cosine_topwords[label][0]

        top_words = {
            "tfidf": tfidf_topwords[label] if use_tfidf else None,
            "cosine_similarity": cosine_topwords[label] if use_cosine else None
        }
        top_word_scores = {
            "tfidf": tfidf_dict[label] if use_tfidf else None,
            "cosine_similarity": cosine_dict[label] if use_cosine else None
        }

        topics.append(Topic(topic_idx = f"{label}",
                            documents = documents,
                            words = vocab,
                            centroid_hd = centroid_hd,
                            centroid_ld = centroid_ld,
                            document_embeddings_hd = embeddings_hd,
                            document_embeddings_ld = embeddings_ld,
                            document_embedding_similarity = centroid_similarity,
                            umap_mapper = umap_mapper,
                            top_words = top_words,
                            top_word_scores = top_word_scores
                            ))
    return topics
def extract_describe_topics_labels_vocab(
    corpus: list[str],
    document_embeddings_hd: np.ndarray,
    document_embeddings_ld: np.ndarray,
    labels: np.ndarray,
    umap_mapper: umap.UMAP,
    vocab_embeddings: np.ndarray,
    enhancer: TopwordEnhancement,
    vocab: list[str] = None,
    n_topwords: int = 2000,
    n_topwords_description: int = 500,
    topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"],
    topword_description_method: str = "cosine_similarity"
) -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided labels that indicate the topics,
    then describes and names the topics with the given enhancer object.

    Thin wrapper: runs ``extract_topics_labels_vocab`` and then
    ``describe_and_name_topics``. Defined as a plain module-level function
    because a module-level ``@staticmethod`` object is only callable from
    Python 3.10 on.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings_hd (np.ndarray): Embeddings of the documents in high-dimensional space.
        document_embeddings_ld (np.ndarray): Embeddings of the documents in low-dimensional space.
        labels (np.ndarray): Labels indicating the topics.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        enhancer (TopwordEnhancement): Enhancer object to enhance the top-words and generate the description.
        vocab (list[str], optional): Vocabulary of the corpus (default is None).
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        n_topwords_description (int, optional): Number of top-words to use from the extracted topics for the description and the name (default is 500).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        topword_description_method (str, optional): Method to use for top-word extraction. Can be "tfidf" or "cosine_similarity" (default is "cosine_similarity").

    Returns:
        list[Topic]: List of Topic objects representing the extracted and described topics.
    """
    # NOTE: the list default is only passed through, never mutated here.
    extracted = extract_topics_labels_vocab(corpus, document_embeddings_hd, document_embeddings_ld, labels, umap_mapper, vocab_embeddings, vocab, n_topwords, topword_extraction_methods)
    return describe_and_name_topics(extracted, enhancer, topword_description_method, n_topwords_description)
def extract_topic_cos_sim(
    documents_topic: list[str],
    document_embeddings_topic: np.ndarray,
    words_topic: list[str],
    vocab_embeddings: dict,
    umap_mapper: umap.UMAP,
    n_topwords: int = 2000
) -> Topic:
    """
    Create a Topic object from the given documents and embeddings by computing the centroid and the top-words.
    Only uses cosine-similarity for top-word extraction.

    This is a plain module-level function: a module-level ``@staticmethod``
    object is only callable from Python 3.10 on, so the decorator is omitted.

    Args:
        documents_topic (list[str]): List of documents in the topic.
        document_embeddings_topic (np.ndarray): High-dimensional embeddings of the documents in the topic.
        words_topic (list[str]): List of words in the topic.
        vocab_embeddings (dict): Embeddings of the vocabulary.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).

    Returns:
        Topic: Topic object representing the extracted topic. Its ``topic_idx`` is None and its
            documents are kept in the order they were passed in.
    """
    extractor = ExtractTopWords()
    centroid_hd = extractor.extract_centroid(document_embeddings_topic)
    centroid_ld = umap_mapper.transform(centroid_hd.reshape(1, -1))[0]

    # All documents belong to the single topic, so everything gets label 0.
    labels = np.zeros(len(documents_topic), dtype = int)
    word_topic_mat = extractor.compute_word_topic_mat(documents_topic, words_topic, labels, consider_outliers = False)  # word-topic matrix of the topic

    cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = words_topic, vocab_embedding_dict = vocab_embeddings, centroid_dict= {0: centroid_ld}, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False)

    # NOTE(review): the extractor's results are stored as returned (not indexed by label 0),
    # unlike the multi-topic extraction functions - confirm consumers expect that shape.
    top_words = {"cosine_similarity": cosine_topwords}
    top_word_scores = {"cosine_similarity": cosine_dict}

    document_embeddings_hd = document_embeddings_topic
    document_embeddings_ld = umap_mapper.transform(document_embeddings_hd)
    # cosine similarity of every document to the centroid in low-dimensional space
    document_embedding_similarity = np.dot(document_embeddings_ld, centroid_ld)/(np.linalg.norm(document_embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld))

    return Topic(topic_idx = None,
                 documents = documents_topic,
                 words = words_topic,
                 centroid_hd = centroid_hd,
                 centroid_ld = centroid_ld,
                 document_embeddings_hd = document_embeddings_hd,
                 document_embeddings_ld = document_embeddings_ld,
                 document_embedding_similarity = document_embedding_similarity,
                 umap_mapper = umap_mapper,
                 top_words = top_words,
                 top_word_scores = top_word_scores
                 )
def extract_and_describe_topic_cos_sim(
    documents_topic: list[str],
    document_embeddings_topic: np.ndarray,
    words_topic: list[str],
    vocab_embeddings: dict,
    umap_mapper: umap.UMAP,
    enhancer: TopwordEnhancement,
    n_topwords: int = 2000,
    n_topwords_description=500
) -> Topic:
    """
    Create a Topic object from the given documents and embeddings by computing the centroid and the top-words.
    Only use cosine-similarity for top-word extraction.
    Describe and name the topic with the given enhancer object.

    Thin wrapper: runs ``extract_topic_cos_sim`` and then
    ``describe_and_name_topics`` on the single resulting topic. Defined as a
    plain module-level function because a module-level ``@staticmethod``
    object is only callable from Python 3.10 on.

    Args:
        documents_topic (list[str]): List of documents in the topic.
        document_embeddings_topic (np.ndarray): High-dimensional embeddings of the documents in the topic.
        words_topic (list[str]): List of words in the topic.
        vocab_embeddings (dict): Embeddings of the vocabulary.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        enhancer (TopwordEnhancement): Enhancer object to enhance the top-words and generate the description.
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        n_topwords_description (int, optional): Number of top-words to use from the extracted topics for the description and the name (default is 500).

    Returns:
        Topic: Topic object representing the extracted and described topic.
    """
    topic = extract_topic_cos_sim(documents_topic, document_embeddings_topic, words_topic, vocab_embeddings, umap_mapper, n_topwords)
    # describe_and_name_topics works on lists; wrap the topic and unwrap the result.
    return describe_and_name_topics([topic], enhancer, "cosine_similarity", n_topwords_description)[0]
def describe_and_name_topics(
    topics: list[Topic],
    enhancer: TopwordEnhancement,
    topword_method="tfidf",
    n_words=500
) -> list[Topic]:
    """
    Describe and name the topics with the given enhancer object.

    The topics are modified in place (name and description are set on each
    one) and the same list is returned. This is a plain module-level function:
    a module-level ``@staticmethod`` object is only callable from Python 3.10
    on, so the decorator is omitted.

    Args:
        topics (list[Topic]): List of Topic objects.
        enhancer (TopwordEnhancement): Enhancer object to enhance the top-words and generate the description.
        topword_method (str, optional): Which extracted top-words to use. Can be "tfidf" or "cosine_similarity" (default is "tfidf").
        n_words (int, optional): Number of topwords to use for the description and the name (default is 500).

    Returns:
        list[Topic]: List of Topic objects with the description and name added.

    Raises:
        ValueError: If ``topword_method`` is neither "tfidf" nor "cosine_similarity".
        Exception: Whatever the enhancer raises if the single retry fails as well.
    """
    if topword_method not in ["tfidf", "cosine_similarity"]:
        raise ValueError("topword_method can only be 'tfidf' or 'cosine_similarity'")

    for topic in tqdm(topics):
        tws = topic.top_words[topword_method]
        try:
            topic_name = enhancer.generate_topic_name_str(tws, n_words = n_words)
            topic_description = enhancer.describe_topic_topwords_str(tws, n_words = n_words)
        except Exception as e:
            # One retry per topic; a second failure propagates to the caller.
            print(f"Error in topic {topic.topic_idx}: {e}")
            print("Trying again...")
            topic_name = enhancer.generate_topic_name_str(tws, n_words = n_words)
            topic_description = enhancer.describe_topic_topwords_str(tws, n_words = n_words)
        topic.set_topic_name(topic_name)
        topic.set_topic_description(topic_description)
    return topics