# -*- coding: utf-8 -*-
"""Generalized Analysis of Text Data
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1N2jR0dGg_8lcRZan9QpqI60o0hchnOIL
# Generalized Analysis of Text Data
Written by [Ken Reid](https://github.com/DrKenReid).
## Intro
We will explore various techniques for analyzing text data regardless of context. The workflow is designed to inspire other projects or to serve as a reference guide for the techniques shown. The goal is to leverage AI to gain deeper insights from textual data, such as identifying themes, detecting sentiment, and recognizing named entities.
To run this notebook, click "Runtime" in the menu at the top, then select "Run all".
## Table of Contents
1. Setup
2. Data Collection
3. Dataset Building
4. Initial Textual Analysis
5. Exploratory Data Analysis
6. AI-Enhanced Insights
6.1 Topic Modeling
6.2 Text Clustering
6.3 Word Embeddings
6.4 Document Similarity
6.5 Named Entity Recognition
6.6 Topic Network Visualization
6.7 Sentiment Analysis
6.8 Text Classification
6.9 Text Summarization
6.10 Part-of-Speech (POS) Tagging
6.11 Dependency Parsing
6.12 Topic Coherence
Conclusion and Next Steps
## 1. Setup
First, let's import the necessary libraries. These libraries provide the tools we need for data manipulation, analysis, and visualization:
"""
# Install required libraries before importing them (most are pre-installed on Colab; output is suppressed)
!pip install pandas numpy matplotlib seaborn nltk spacy textblob scikit-learn gensim networkx transformers > /dev/null 2>&1 || echo "Error occurred during installation"
# Download the English language model for spaCy
!python -m spacy download en_core_web_sm > /dev/null 2>&1 || echo "Error occurred during download"

# Importing essential libraries for data handling and analysis
import pandas as pd # For data manipulation and analysis
import numpy as np # For numerical operations
import matplotlib.pyplot as plt # For creating visualizations
import seaborn as sns # For statistical data visualization
# Importing libraries specifically for textual data processing and AI techniques
import nltk # Natural Language Toolkit, useful for text processing
from nltk.corpus import stopwords # For filtering out common words
from nltk.tokenize import word_tokenize # For splitting text into words
from nltk.stem import WordNetLemmatizer # For reducing words to their base form
from nltk.sentiment.vader import SentimentIntensityAnalyzer # For sentiment analysis
import spacy # For advanced NLP tasks like Named Entity Recognition
from textblob import TextBlob # For sentiment analysis and other NLP tasks
# Machine Learning libraries
from sklearn.datasets import fetch_20newsgroups # For fetching the 20 Newsgroups dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # For converting text to numerical data
from sklearn.decomposition import LatentDirichletAllocation, PCA # For topic modeling and dimensionality reduction
from sklearn.cluster import KMeans # For text clustering
from sklearn.manifold import TSNE # For visualizing high-dimensional data
from sklearn.metrics.pairwise import cosine_similarity # For document similarity
from sklearn.model_selection import train_test_split # For splitting data into training and testing sets
from sklearn.linear_model import LogisticRegression # For text classification
from sklearn.metrics import accuracy_score, classification_report # For evaluating classification models
# Word Embeddings
from gensim.models import Word2Vec # For creating word embeddings
# Network Analysis
import networkx as nx # For creating network visualizations
# Text Summarization
from transformers import pipeline # For text summarization using pre-trained models
# Topic Coherence
from gensim.corpora import Dictionary # For creating a dictionary representation of a corpus
from gensim.models.coherencemodel import CoherenceModel # For calculating topic coherence
# Other utilities
from typing import List, Tuple, Dict # For type hinting
from collections import Counter, defaultdict # For counting and creating default dictionaries
import re # For regular expressions
# Ensure the necessary resources for NLTK are downloaded
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import requests
# Fetch additional stopwords
url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt"
response = requests.get(url)
additional_stopwords = set(response.text.split())
# Combine NLTK stopwords with additional stopwords
stop_words = set(stopwords.words('english')).union(additional_stopwords)
# Load the spaCy model for English
nlp = spacy.load("en_core_web_sm")
# Set visualization styles
sns.set(style="whitegrid")
print("Initialization completed.")
"""## 2. Data Collection
In this section, we'll use the 20 Newsgroups dataset, a popular dataset for text classification tasks in machine learning. This dataset contains approximately 20,000 newsgroup documents, organized into 20 categories.
"""
def collect_data(categories: List[str] = None) -> Tuple[List[str], List[int], List[str]]:
"""
Fetch data from the 20 Newsgroups dataset.
Args:
        categories (List[str], optional): List of categories to fetch. If None, a default
                                          selection of four categories is used.
Returns:
Tuple[List[str], List[int], List[str]]: A tuple containing the list of documents,
their corresponding category indices,
and the category names.
"""
if categories is None:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# Fetch the dataset
newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))
# Return the data, target labels, and target names
return newsgroups.data, newsgroups.target, newsgroups.target_names
# Collect data
documents, categories, category_names = collect_data()
print(f"Collected {len(documents)} documents across {len(category_names)} categories.")
print(f"Categories: {', '.join(category_names)}")
print("Data collection complete.")
"""## 3. Dataset Building
Now that we have our raw data, let's build a structured dataset using pandas. This will help us organize the text documents along with their metadata for easier analysis.
"""
import pandas as pd # Import pandas for data manipulation
def build_dataset(documents: List[str], categories: List[int], category_names: List[str]) -> pd.DataFrame:
"""
Build a pandas DataFrame from the collected documents.
Args:
documents (List[str]): List of text documents.
categories (List[int]): List of category indices for each document.
category_names (List[str]): List of category names.
Returns:
pd.DataFrame: A DataFrame containing the documents and their metadata.
"""
# Create a DataFrame with the text, category names, document length, and word count
df = pd.DataFrame({
'text': documents,
'category': [category_names[cat] for cat in categories],
'length': [len(doc) for doc in documents],
'word_count': [len(doc.split()) for doc in documents]
})
return df
# Build the dataset
df = build_dataset(documents, categories, category_names)
print("*************Example of the data:*************")
print(df.head()) # Display the first few rows of the DataFrame
print("\n*************Shape of the data:*************")
print(f"\nDataset shape: {df.shape}") # Print the shape of the DataFrame
print("\n*************Category distribution of the data:*************")
print(f"\nCategory distribution:\n{df['category'].value_counts()}") # Print the distribution of categories
print("\n*************Summary statistics of the data:*************")
print(f"\nSummary statistics:\n{df.describe()}") # Print summary statistics
print("\n\nDataset building complete.")
"""## 4. Initial Textual Analysis
Let's perform some initial analysis on our text data, including word frequency and basic statistics. This will help us understand the common words used and some basic characteristics of our dataset.
"""
def analyze_text(df: pd.DataFrame) -> Tuple[Counter, Dict[str, float]]:
"""
Perform initial textual analysis on the dataset.
Args:
df (pd.DataFrame): The dataset containing text documents.
Returns:
Tuple[Counter, Dict[str, float]]: A tuple containing word frequency counter and text statistics.
"""
# Combine all text documents into one large text
all_text = ' '.join(df['text'])
# Tokenize text and filter out stopwords, non-alphanumeric words, and integers
words = [word.lower() for word in word_tokenize(all_text)
if word.isalnum() and word.lower() not in stop_words and not word.isdigit()]
# Count the frequency of each word
word_freq = Counter(words)
# Calculate basic statistics of the dataset
stats = {
'avg_document_length': df['length'].mean(), # Average length of documents
'avg_word_count': df['word_count'].mean(), # Average word count per document
'unique_words': len(set(words)) # Number of unique words in the dataset
}
return word_freq, stats
# Perform initial analysis
word_freq, text_stats = analyze_text(df)
# Display the results
print("Top 10 most common words, excluding stopwords:")
print(word_freq.most_common(10))
print("\nText statistics:")
for key, value in text_stats.items():
print(f"{key}: {value:.2f}")
"""## 5. Exploratory Data Analysis
Let's visualize some of our findings to gain insights into our data. We'll use a colorblind-friendly palette and modern Seaborn styling.
"""
def set_plot_style():
"""
Set up the plotting style for better visuals and colorblind-friendliness.
"""
# Use a colorblind-friendly palette
colors = ['#0173B2', '#DE8F05', '#029E73', '#D55E00', '#CC78BC', '#CA9161', '#FBAFE4', '#949494', '#ECE133', '#56B4E9']
sns.set_palette(sns.color_palette(colors))
# Set the style to a cleaner, more modern look
sns.set_style("whitegrid")
# Increase font sizes for better readability
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['axes.titlesize'] = 16
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
def visualize_data(df: pd.DataFrame, word_freq: Counter):
"""
Create visualizations for exploratory data analysis.
Args:
df (pd.DataFrame): The dataset containing text documents and their metadata.
word_freq (Counter): Word frequency counter from the initial textual analysis.
"""
# Apply the plotting style
set_plot_style()
# Plot 1: Word frequency distribution
plt.figure(figsize=(14, 7))
words, counts = zip(*word_freq.most_common(20))
sns.barplot(x=list(words), y=list(counts), color=sns.color_palette()[0])
plt.title('Top 20 Most Frequent Words')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# Plot 2: Word count distribution by category
plt.figure(figsize=(12, 7))
sns.boxplot(x='category', y='word_count', data=df)
plt.title('Word Count Distribution by Category')
plt.xlabel('Category')
plt.ylabel('Word Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Create visualizations
visualize_data(df, word_freq)
"""## 6. AI-Enhanced Insights
In this section, we'll explore various techniques that can be loosely categorized as "AI" to gain deeper insights into our textual data. These methods can help uncover patterns, themes, and relationships that might not be immediately apparent through traditional analysis.
### 6.1 Topic Modeling
This cell performs topic modeling on our dataset and provides two visualizations:
1. Bar plots showing the top words for each identified topic.
2. An area plot showing the distribution of topics across the first 100 documents in the dataset.
These visualizations aid in understanding the main themes in text data and how these themes are distributed across documents.
"""
def perform_topic_modeling(df: pd.DataFrame, num_topics: int = 6) -> Tuple[List[List[str]], List[List[float]]]:
"""
Perform topic modeling using Latent Dirichlet Allocation (LDA).
Args:
df (pd.DataFrame): The dataset containing text documents.
num_topics (int): Number of topics to extract.
Returns:
Tuple[List[List[str]], List[List[float]]]: A tuple containing the top words for each topic
and the topic distribution for each document.
"""
# Convert text documents to a TF-IDF matrix
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['text'])
# Fit the LDA model
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(tfidf_matrix)
# Extract the top words for each topic
feature_names = vectorizer.get_feature_names_out()
top_words = []
for topic_idx, topic in enumerate(lda.components_):
top_words_idx = topic.argsort()[:-11:-1] # Get indices of top words for the topic
top_words.append([feature_names[i] for i in top_words_idx])
# Calculate the topic distribution for each document
doc_topic_dist = lda.transform(tfidf_matrix)
return top_words, doc_topic_dist.tolist()
# Perform topic modeling
top_words, doc_topic_dist = perform_topic_modeling(df)
# Print topics and their top words
print("Topics and their top words:")
for i, topic_words in enumerate(top_words):
print(f"Topic {i+1}: {', '.join(topic_words)}")
# Visualize top words for each topic
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
axes = axes.flatten()
for i, (topic_words, ax) in enumerate(zip(top_words, axes)):
words = topic_words[:10] # Take top 10 words
word_importance = range(len(words), 0, -1) # Assign importance based on order
ax.barh(words, word_importance, color=sns.color_palette()[i % len(sns.color_palette())])
ax.set_title(f'Topic {i+1} Top Words')
ax.invert_yaxis() # Invert y-axis to show most important words on top
plt.tight_layout()
plt.show()
# Visualize topic distribution across documents
topic_names = [f'Topic {i+1}' for i in range(len(top_words))]
doc_topic_df = pd.DataFrame(doc_topic_dist, columns=topic_names)
plt.figure(figsize=(12, 6))
doc_topic_df.iloc[:100].plot(kind='area', stacked=True, colormap='tab20')
plt.title('Topic Distribution in First 100 Documents')
plt.xlabel('Document')
plt.ylabel('Topic Probability')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
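# Optional variation (a sketch, not part of the original workflow): LDA is commonly fit on raw
# term counts rather than TF-IDF weights. The helper below is a hypothetical alternative using
# the CountVectorizer already imported above, kept separate so it does not alter the topics shown.
def perform_topic_modeling_counts(df: pd.DataFrame, num_topics: int = 6) -> List[List[str]]:
    """Fit LDA on raw term counts and return the top 10 words per topic."""
    count_vectorizer = CountVectorizer(max_features=5000, stop_words='english')
    count_matrix = count_vectorizer.fit_transform(df['text'])
    lda_counts = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_counts.fit(count_matrix)
    feature_names = count_vectorizer.get_feature_names_out()
    return [[feature_names[i] for i in topic.argsort()[:-11:-1]] for topic in lda_counts.components_]

# Uncomment to compare count-based topics against the TF-IDF-based topics above:
# for i, words in enumerate(perform_topic_modeling_counts(df)):
#     print(f"Count-based Topic {i+1}: {', '.join(words)}")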
"""### 6.2 Text Clustering
Clustering is a powerful technique that allows us to group similar documents together based on their content. This can reveal hidden patterns and themes within large collections of text, providing insights into how different topics or sentiments are distributed across the data.
"""
def preprocess_text(text: str) -> str:
"""
Preprocess the input text by converting to lowercase, removing special characters,
tokenizing, removing stopwords, and lemmatizing.
Args:
text (str): The input text document.
Returns:
str: The preprocessed text.
"""
text = text.lower() # Convert to lowercase
text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove special characters and numbers
tokens = word_tokenize(text) # Tokenize
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words and len(token) > 2] # Remove stopwords and short words
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(token) for token in tokens] # Lemmatize
return ' '.join(tokens)
def perform_text_clustering(df: pd.DataFrame, n_clusters: int = 5) -> Tuple[List[int], np.ndarray]:
"""
Perform text clustering using K-means.
Args:
df (pd.DataFrame): The dataset containing text documents.
n_clusters (int): Number of clusters to create.
Returns:
Tuple[List[int], np.ndarray]: Cluster assignments for each document and the TF-IDF matrix.
"""
df['processed_text'] = df['text'].apply(preprocess_text) # Preprocess the text
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['processed_text']) # Create TF-IDF matrix
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42) # Initialize KMeans
cluster_assignments = kmeans.fit_predict(tfidf_matrix) # Fit KMeans and get cluster assignments
return cluster_assignments.tolist(), tfidf_matrix
# Perform text clustering
cluster_assignments, tfidf_matrix = perform_text_clustering(df)
df['cluster'] = cluster_assignments # Add cluster assignments to the DataFrame
# Print cluster distribution
print("Cluster distribution:")
print(df['cluster'].value_counts().sort_index())
# Visualize clusters
pca = PCA(n_components=2)
coords = pca.fit_transform(tfidf_matrix.toarray()) # Reduce dimensionality for visualization
plt.figure(figsize=(12, 8))
scatter = plt.scatter(coords[:, 0], coords[:, 1], c=df['cluster'], cmap='viridis', alpha=0.7)
plt.colorbar(scatter)
plt.title('Document Clusters Visualization')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.tight_layout()
plt.show()
# Analyze clusters
for cluster in sorted(df['cluster'].unique()):  # Iterate over the clusters actually assigned
print(f"\nTop words in Cluster {cluster}:")
cluster_docs = df[df['cluster'] == cluster]['processed_text']
vectorizer = TfidfVectorizer(max_features=10, stop_words='english')
cluster_tfidf = vectorizer.fit_transform(cluster_docs)
top_words = vectorizer.get_feature_names_out()
print(', '.join(top_words))
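# Optional check (a sketch, not part of the original workflow): the choice of n_clusters=5 above
# is arbitrary. A quick silhouette-score sweep can suggest a reasonable number of clusters;
# higher scores indicate better-separated clusters. suggest_n_clusters is a hypothetical helper.
from sklearn.metrics import silhouette_score

def suggest_n_clusters(matrix, candidates=range(3, 9)) -> None:
    """Print the silhouette score for each candidate number of clusters."""
    for k in candidates:
        labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(matrix)
        print(f"k={k}: silhouette score = {silhouette_score(matrix, labels):.3f}")

# Uncomment to run the sweep (can be slow on the full TF-IDF matrix):
# suggest_n_clusters(tfidf_matrix)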
"""This plot shows how a computer has grouped similar documents together based on their content. Each dot represents a document, and documents that are close to each other are more similar. The different colors show different groups or topics the computer has identified.
You can see some clear groupings (like the green and teal clusters), which might represent distinct topics in your data. The overlapping areas (like between yellow and blue) could show documents that discuss multiple related topics.
This visualization helps you quickly see patterns in your data that might be hard to spot when reading through hundreds of documents manually. It's a starting point for understanding the main themes in your dataset and how they relate to each other.
### 6.3 Word Embeddings
Word embeddings are a powerful way to represent text data. They capture the semantic meaning of words by mapping them to vectors in a continuous space, where similar words are close to each other. The visualization below uses a technique called t-SNE (t-distributed stochastic neighbor embedding), a statistical method for visualizing high-dimensional data by giving each data point a location in a two- or three-dimensional map.
"""
def create_word_embeddings(df: pd.DataFrame) -> Word2Vec:
"""
Create word embeddings using Word2Vec.
Args:
df (pd.DataFrame): The dataset containing text documents.
Returns:
Word2Vec: Trained Word2Vec model.
"""
# Ensure we have processed text
if 'processed_text' not in df.columns:
df['processed_text'] = df['text'].apply(preprocess_text)
# Tokenize the processed text
tokenized_texts = [text.split() for text in df['processed_text']]
# Train Word2Vec model
model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=5, workers=4)
return model
# Create word embeddings
word2vec_model = create_word_embeddings(df)
# Find similar words
print("Words similar to 'space':")
print(word2vec_model.wv.most_similar('space', topn=10))
print("\nWords similar to 'computer':")
print(word2vec_model.wv.most_similar('computer', topn=10))
# Visualize word embeddings
def plot_word_embeddings(model, words):
"""
Plot word embeddings using t-SNE for dimensionality reduction.
Args:
model (Word2Vec): Trained Word2Vec model.
words (List[str]): List of words to visualize.
"""
# Extract word vectors
word_vectors = np.array([model.wv[word] for word in words])
# Reduce dimensions using t-SNE
tsne = TSNE(n_components=2, random_state=42)
word_vectors_2d = tsne.fit_transform(word_vectors)
# Plot
plt.figure(figsize=(12, 8))
plt.scatter(word_vectors_2d[:, 0], word_vectors_2d[:, 1], c='blue', alpha=0.5)
for i, word in enumerate(words):
plt.annotate(word, xy=(word_vectors_2d[i, 0], word_vectors_2d[i, 1]))
plt.title('Word Embeddings Visualization')
plt.xlabel('t-SNE feature 1')
plt.ylabel('t-SNE feature 2')
plt.tight_layout()
plt.show()
# Select top 50 most frequent words for visualization
word_freq = {word: word2vec_model.wv.get_vecattr(word, "count") for word in word2vec_model.wv.key_to_index}
top_words = sorted(word_freq, key=word_freq.get, reverse=True)[:50]
# Plot word embeddings
plot_word_embeddings(word2vec_model, top_words)
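# Small follow-up probe (illustrative word pairs chosen as an assumption; they may not have met
# the min_count threshold): pairwise cosine similarity makes the "close together on the plot"
# idea concrete.
for w1, w2 in [('jesus', 'god'), ('image', 'jpeg'), ('space', 'god')]:
    if w1 in word2vec_model.wv and w2 in word2vec_model.wv:
        print(f"similarity({w1}, {w2}) = {word2vec_model.wv.similarity(w1, w2):.3f}")
    else:
        print(f"'{w1}' or '{w2}' is not in the embedding vocabulary")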
"""This plot shows how words relate to each other based on how they're used in your documents. Each word is represented by a dot, and words that are close together on the plot are used in similar ways or contexts in your data.
For example, you can see 'jesus' and 'god' are very close together, which makes sense as they often appear in similar contexts. Similarly, 'file', 'image', and 'jpeg' are clustered, likely because they're often used together when discussing digital images.
This visualization helps you quickly see patterns in language use across your documents. It can reveal themes, associations, or even biases in your data that might not be obvious from reading.
The technical term for this is "word embeddings," where complex relationships between words are transformed into positions in a two-dimensional space. It's a powerful tool for analyzing large amounts of text data in a way that captures subtle patterns of language use.
### 6.4 Document Similarity
Understanding the similarity between documents can reveal how themes and topics overlap across different texts. This is useful for identifying closely related documents or clusters of documents with similar content.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def compute_document_similarity(df: pd.DataFrame) -> np.ndarray:
"""
Compute document similarity using TF-IDF and cosine similarity.
Args:
df (pd.DataFrame): The dataset containing text documents.
Returns:
np.ndarray: Similarity matrix for documents.
"""
# Ensure we have processed text
if 'processed_text' not in df.columns:
df['processed_text'] = df['text'].apply(preprocess_text)
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
similarity_matrix = cosine_similarity(tfidf_matrix)
return similarity_matrix
# Compute document similarity
similarity_matrix = compute_document_similarity(df)
# Visualize similarity matrix
plt.figure(figsize=(12, 10))
sns.heatmap(similarity_matrix[:100, :100], cmap='viridis')
plt.title('Document Similarity Heatmap (First 100 Documents)')
plt.xlabel('Document Index')
plt.ylabel('Document Index')
plt.show()
# Find most similar document pairs
def find_most_similar_pairs(similarity_matrix: np.ndarray, top_n: int = 5) -> List[Tuple[int, int, float]]:
"""
Find the most similar document pairs.
Args:
similarity_matrix (np.ndarray): The document similarity matrix.
top_n (int): Number of top pairs to return.
Returns:
List[Tuple[int, int, float]]: List of (doc1_index, doc2_index, similarity_score) tuples.
"""
    # Work on a copy so the caller's similarity matrix is not modified
    sim = similarity_matrix.copy()
    # Mask the diagonal and lower triangle so each symmetric pair is counted only once
    sim[np.tril_indices_from(sim)] = -1
    # Find indices of the top similar pairs
    indices = np.unravel_index(np.argsort(sim, axis=None)[-top_n:], sim.shape)
    # Create list of (doc1_index, doc2_index, similarity_score) tuples
    similar_pairs = [(indices[0][i], indices[1][i], sim[indices[0][i], indices[1][i]])
                     for i in range(top_n)]
    return similar_pairs[::-1]  # Reverse to get highest similarity first
# Find and print most similar document pairs
most_similar_pairs = find_most_similar_pairs(similarity_matrix)
print("Most similar document pairs:")
for i, (doc1, doc2, score) in enumerate(most_similar_pairs, 1):
print(f"{i}. Documents {doc1} and {doc2}: Similarity score = {score:.4f}")
print(f" Doc {doc1}: {df['text'].iloc[doc1][:100]}...")
print(f" Doc {doc2}: {df['text'].iloc[doc2][:100]}...")
print()
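# Optional follow-up sketch (not part of the original workflow): the same similarity matrix can
# power a simple "more like this" lookup for a single document. most_similar_documents is a
# hypothetical helper added for illustration.
def most_similar_documents(similarity_matrix: np.ndarray, doc_index: int, top_n: int = 3) -> List[Tuple[int, float]]:
    """Return the indices and scores of the documents most similar to doc_index."""
    scores = similarity_matrix[doc_index].copy()
    scores[doc_index] = -1  # Exclude the document itself
    top_indices = np.argsort(scores)[::-1][:top_n]
    return [(int(i), float(scores[i])) for i in top_indices]

print("Documents most similar to document 0:")
for idx, score in most_similar_documents(similarity_matrix, 0):
    print(f"  Doc {idx} (similarity {score:.4f}): {df['text'].iloc[idx][:80]}...")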
"""This heatmap shows how similar each document is to every other document in your dataset (for the first 100 documents). It's like a visual comparison chart for your texts.
Each square represents the similarity between two documents. The brighter the color (towards yellow), the more similar the documents are. The dark purple squares show documents that are very different from each other.
The bright yellow diagonal line from top-left to bottom-right shows each document compared to itself, which is why it's the brightest (100% similar).
If you see bright squares off the diagonal, it might indicate groups of documents that are very similar – perhaps discussing the same topic or written in a similar style.
This visualization can help you spot trends, outliers, or groups in your dataset without having to read through all the documents manually. It's particularly helpful when dealing with large amounts of text data, allowing you to see relationships that might not be obvious when reading the texts one by one.
### 6.5 Named Entity Recognition
Named Entity Recognition (NER) is a technique used to identify and classify named entities in text, such as people, organizations, locations, dates, and more. This can help extract structured information from unstructured text, making it easier to analyze and understand the data.
"""
import spacy
from collections import Counter
import matplotlib.pyplot as plt
# Load the English NER model
nlp = spacy.load("en_core_web_sm")
def perform_ner(df: pd.DataFrame, sample_size: int = 1000) -> Dict[str, Counter]:
"""
Perform Named Entity Recognition on a sample of documents.
Args:
df (pd.DataFrame): The dataset containing text documents.
sample_size (int): Number of documents to sample for NER.
Returns:
Dict[str, Counter]: Dictionary of entity types and their frequency counts.
"""
# Sample documents if the dataset is large
if len(df) > sample_size:
sampled_docs = df['text'].sample(n=sample_size, random_state=42)
else:
sampled_docs = df['text']
entity_counts = {}
for doc in nlp.pipe(sampled_docs, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
for ent in doc.ents:
if ent.label_ not in entity_counts:
entity_counts[ent.label_] = Counter()
entity_counts[ent.label_][ent.text] += 1
return entity_counts
# Perform NER
entity_counts = perform_ner(df)
# Print top entities for each type
print("Top entities by type:")
for entity_type, counts in entity_counts.items():
print(f"\n{entity_type}:")
for entity, count in counts.most_common(5):
print(f" {entity}: {count}")
# Visualize entity type distribution
entity_type_counts = {etype: sum(counts.values()) for etype, counts in entity_counts.items()}
sorted_types = sorted(entity_type_counts.items(), key=lambda x: x[1], reverse=True)
plt.figure(figsize=(12, 6))
plt.bar([t[0] for t in sorted_types], [t[1] for t in sorted_types])
plt.title('Distribution of Named Entity Types')
plt.xlabel('Entity Type')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Visualize top entities across all types
all_entities = Counter()
for counts in entity_counts.values():
all_entities.update(counts)
top_entities = all_entities.most_common(20)
plt.figure(figsize=(12, 6))
plt.bar([e[0] for e in top_entities], [e[1] for e in top_entities])
plt.title('Top 20 Named Entities Across All Types')
plt.xlabel('Entity')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
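# Optional visual check (a sketch, not part of the original workflow): spaCy's displaCy renderer
# highlights the entities found in a single document, which is a handy way to sanity-check the
# counts above. The 1000-character cutoff is an arbitrary choice to keep the rendering short.
from spacy import displacy

sample_doc = nlp(df['text'].iloc[0][:1000])
displacy.render(sample_doc, style="ent", jupyter=True)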
"""### 6.6 Topic Network Visualization
Topic network visualization helps in understanding the relationships between topics and the words that define them. A network graph makes it easier to explore the connections between different topics and their key terms.
"""
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
def perform_topic_modeling(df: pd.DataFrame, num_topics: int = 5, num_words: int = 10) -> List[List[str]]:
"""
Perform topic modeling using Latent Dirichlet Allocation.
Args:
df (pd.DataFrame): The dataset containing text documents.
num_topics (int): Number of topics to extract.
num_words (int): Number of top words to include for each topic.
Returns:
List[List[str]]: A list of topics, where each topic is a list of top words.
"""
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(tfidf_matrix)
feature_names = vectorizer.get_feature_names_out()
top_words = []
for topic_idx, topic in enumerate(lda.components_):
top_words_idx = topic.argsort()[:-num_words - 1:-1]
top_words.append([feature_names[i] for i in top_words_idx])
return top_words
def visualize_topic_network(topic_words: List[List[str]]):
"""
Visualize topics as a network graph.
Args:
topic_words (List[List[str]]): List of top words for each topic.
"""
G = nx.Graph()
for i, topic in enumerate(topic_words):
topic_node = f"Topic {i+1}"
G.add_node(topic_node, node_type='topic')
for word in topic:
G.add_node(word, node_type='word')
G.add_edge(topic_node, word)
plt.figure(figsize=(20, 20))
pos = nx.spring_layout(G, k=0.5, iterations=50)
# Draw topic nodes
topic_nodes = [node for node, data in G.nodes(data=True) if data['node_type'] == 'topic']
nx.draw_networkx_nodes(G, pos, nodelist=topic_nodes, node_color='lightblue', node_size=3000, alpha=0.8)
# Draw word nodes
word_nodes = [node for node, data in G.nodes(data=True) if data['node_type'] == 'word']
nx.draw_networkx_nodes(G, pos, nodelist=word_nodes, node_color='lightgreen', node_size=2000, alpha=0.6)
# Draw edges
nx.draw_networkx_edges(G, pos, width=1, alpha=0.5)
# Add labels
nx.draw_networkx_labels(G, pos, font_size=10, font_weight="bold")
plt.title("Topic-Word Network", fontsize=20)
plt.axis('off')
plt.tight_layout()
plt.show()
# Perform topic modeling
top_words = perform_topic_modeling(df, num_topics=8, num_words=10)
# Print topics and their top words
print("Topics and their top words:")
for i, topic_words in enumerate(top_words):
print(f"Topic {i+1}: {', '.join(topic_words)}")
# Visualize topic network
visualize_topic_network(top_words)
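# Small follow-up (a sketch, not part of the original workflow): words that appear in the
# top-word lists of more than one topic are the "bridges" in the network above.
shared_word_counts = Counter(word for topic in top_words for word in topic)
shared_words = [word for word, count in shared_word_counts.items() if count > 1]
print(f"Words shared by multiple topics: {', '.join(shared_words) if shared_words else 'none'}")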
"""### 6.7 Sentiment Analysis
Sentiment analysis helps in understanding the emotional tone of the text, revealing whether the content is positive, negative, or neutral. This is particularly useful for gauging public opinion, tracking changes in sentiment over time, and identifying key emotional drivers in the text data.
"""
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
def perform_sentiment_analysis(df: pd.DataFrame) -> pd.Series:
"""
Perform sentiment analysis on the text documents.
Args:
df (pd.DataFrame): The dataset containing text documents.
Returns:
pd.Series: A series of sentiment polarity scores.
"""
return df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
# Perform sentiment analysis
df['sentiment'] = perform_sentiment_analysis(df)
# Visualize sentiment distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['sentiment'], kde=True)
plt.title('Distribution of Sentiment Scores')
plt.xlabel('Sentiment Polarity')
plt.ylabel('Frequency')
plt.show()
# Calculate average sentiment by category
avg_sentiment = df.groupby('category')['sentiment'].mean().sort_values(ascending=False)
# Visualize average sentiment by category
plt.figure(figsize=(12, 6))
avg_sentiment.plot(kind='bar')
plt.title('Average Sentiment by Category')
plt.xlabel('Category')
plt.ylabel('Average Sentiment')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Find most positive and negative documents
most_positive = df.loc[df['sentiment'].idxmax()]
most_negative = df.loc[df['sentiment'].idxmin()]
print("Most positive document:")
print(f"Category: {most_positive['category']}")
print(f"Sentiment score: {most_positive['sentiment']:.4f}")
print(f"Text: {most_positive['text'][:500]}...")
print("\nMost negative document:")
print(f"Category: {most_negative['category']}")
print(f"Sentiment score: {most_negative['sentiment']:.4f}")
print(f"Text: {most_negative['text'][:500]}...")
# Correlation between document length and sentiment
df['doc_length'] = df['text'].str.len()
correlation = df['sentiment'].corr(df['doc_length'])
plt.figure(figsize=(10, 6))
plt.scatter(df['doc_length'], df['sentiment'], alpha=0.5)
plt.title(f'Document Length vs Sentiment (Correlation: {correlation:.4f})')
plt.xlabel('Document Length')
plt.ylabel('Sentiment')
plt.show()
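# Optional comparison (a sketch, not part of the original workflow): NLTK's VADER analyzer,
# whose lexicon was downloaded in the setup cell, gives a second opinion on sentiment; its
# 'compound' score is also in the range [-1, 1].
sia = SentimentIntensityAnalyzer()
df['vader_sentiment'] = df['text'].apply(lambda x: sia.polarity_scores(x)['compound'])
print("Correlation between TextBlob and VADER sentiment scores:",
      round(df['sentiment'].corr(df['vader_sentiment']), 4))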
"""###6.8 Text Classification
Text classification involves categorizing text into predefined categories. This can automate tasks such as tagging documents, detecting spam, or classifying sentiment. Here is a basic implementation using a machine learning model:
"""
def classify_text(df: pd.DataFrame):
"""
Classify text documents into predefined categories using a logistic regression model.
Args:
df (pd.DataFrame): The dataset containing text documents and their categories.
"""
# Preprocess text
df['processed_text'] = df['text'].apply(preprocess_text)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['category'], test_size=0.2, random_state=42)
# Vectorize text data
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
# Train a logistic regression model
    model = LogisticRegression(max_iter=1000)  # Higher iteration cap helps the solver converge on TF-IDF features
model.fit(X_train_tfidf, y_train)
# Predict and evaluate
y_pred = model.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))
# Perform text classification
classify_text(df)
"""- `Accuracy`: Overall accuracy of the model across all classes
- `precision`: Proportion of true positive predictions among all positive predictions for each class
- `recall`: Proportion of true positive predictions among all actual positive instances for each class
- `f1-score`: Harmonic mean of precision and recall for each class
- `support`: Number of instances for each class in the test set
Additional rows:
- `accuracy`: Overall accuracy (repeated from top)
- `macro avg`: Unweighted mean of metrics across all classes
- `weighted avg`: Weighted average of metrics, accounting for class imbalance
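As an illustration with made-up counts for one class: 40 true positives, 10 false positives, and 20 false negatives give precision = 40 / (40 + 10) = 0.80, recall = 40 / (40 + 20) ≈ 0.67, and f1-score = 2 × (0.80 × 0.67) / (0.80 + 0.67) ≈ 0.73.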
### 6.9 Text Summarization
Text summarization generates a concise summary of a longer text, useful for quickly understanding large volumes of text. Here's an implementation using a pre-trained model from the Hugging Face transformers library:
"""
from transformers import pipeline
def summarize_text(text: str, max_length: int, min_length: int, max_input_length: int = 1024) -> str:
"""
Summarize a text document using Hugging Face Transformers.
Args:
text (str): The input text document.
max_length (int): Maximum length of the summary.
min_length (int): Minimum length of the summary.
max_input_length (int): Maximum length of input text for the model.
Returns:
str: The summarized text.
"""
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
def truncate_text(text):
"""
Truncate the input text to the maximum input length for the model.
"""
tokenized_text = text.split()
if len(tokenized_text) > max_input_length:
return ' '.join(tokenized_text[:max_input_length])
return text
truncated_text = truncate_text(text)
return summarizer(truncated_text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
# Select two example documents
example_texts = df['text'].iloc[:2]
# Summarize the example documents with explicit lengths
summarized_texts = example_texts.apply(lambda x: summarize_text(x, max_length=50, min_length=25))
# Display the original and summarized versions
for i, (original, summarized) in enumerate(zip(example_texts, summarized_texts)):
print(f"Original Text {i+1}:\n{original}\n")
print(f"Summarized Text {i+1}:\n{summarized}\n")
"""###6.10 Part-of-Speech (POS) Tagging
POS tagging assigns parts of speech to each word in a text. Here's how to perform POS tagging using spaCy:
"""
def pos_tagging(text: str) -> str:
"""
Perform POS tagging on a text document.
Args:
text (str): The input text document.
Returns:
str: The text with POS tags.
"""
doc = nlp(text)
return ' '.join([f"{token.text}/{token.pos_}" for token in doc])
# Select two example documents
example_texts = df['text'].iloc[:2]
# Apply POS tagging to the example documents
pos_tagged_texts = example_texts.apply(pos_tagging)
# Display the original and POS-tagged versions
for i, (original, pos_tagged) in enumerate(zip(example_texts, pos_tagged_texts)):
print(f"Original Text {i+1}:\n{original}\n")
print(f"POS-Tagged Text {i+1}:\n{pos_tagged}\n")
"""###6.11 Dependency Parsing
Dependency parsing identifies the grammatical structure of a sentence, showing the relationships between words. This is useful for deep linguistic analysis and understanding complex sentence structures.
"""
def dependency_parsing(text: str):
"""
Perform dependency parsing on a text document.
Args:
text (str): The input text document.
"""
doc = nlp(text)
for token in doc:
print(f"{token.text} ({token.dep_}) --> {token.head.text}")
# Select an example document