BBC News NLP Pipeline: Text Classification & Topic Modelling¶
This project uses Natural Language Processing (NLP) techniques to classify news articles from the BBC News dataset [1] into five categories (business, entertainment, politics, sport, tech).
The project follows an end-to-end NLP pipeline workflow in four key stages:
- Data Loading & Preprocessing
- Exploratory Data Analysis
- Text Classification with Logistic Regression
- Topic Modelling with LDA
# Import libraries
import os
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
# Download stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
Data Loading¶
We will first read all text files from the BBC News dataset, assigning each article a label based on the category folder it comes from. The texts will be stored in a Pandas dataframe to create a structured dataset where each row contains the article's file name, category, and raw text.
categories = ["business","entertainment","politics","sport","tech"]
# Collect all articles
articles = []
for category in categories:
    folder_path = "bbc/" + category
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            articles.append({"filename": filename, "category": category, "text": text})
# Create dataframe
df = pd.DataFrame(articles)
# Show the first few rows
df.head()
| | filename | category | text |
|---|---|---|---|
| 0 | 289.txt | business | UK economy facing 'major risks'\n\nThe UK manu... |
| 1 | 504.txt | business | Aids and climate top Davos agenda\n\nClimate c... |
| 2 | 262.txt | business | Asian quake hits European shares\n\nShares in ... |
| 3 | 276.txt | business | India power shares jump on debut\n\nShares in ... |
| 4 | 510.txt | business | Lacroix label bought by US firm\n\nLuxury good... |
Preprocessing¶
We will preprocess the text data to create a new column (clean_text) by lowercasing the text and removing unnecessary characters such as punctuation, digits, and extra whitespace, as well as stopwords. This means the model will focus on meaningful words that help distinguish texts from different categories.
def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation and digits
    text = re.sub(r"[^a-z\s]", " ", text)
    # Collapse extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    # Remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)
# Add cleaned data to the dataframe
df["clean_text"] = df["text"].apply(preprocess)
df.head()
| | filename | category | text | clean_text |
|---|---|---|---|---|
| 0 | 289.txt | business | UK economy facing 'major risks'\n\nThe UK manu... | uk economy facing major risks uk manufacturing... |
| 1 | 504.txt | business | Aids and climate top Davos agenda\n\nClimate c... | aids climate top davos agenda climate change f... |
| 2 | 262.txt | business | Asian quake hits European shares\n\nShares in ... | asian quake hits european shares shares europe... |
| 3 | 276.txt | business | India power shares jump on debut\n\nShares in ... | india power shares jump debut shares india lar... |
| 4 | 510.txt | business | Lacroix label bought by US firm\n\nLuxury good... | lacroix label bought us firm luxury goods grou... |
Exploratory Data Analysis¶
We now use EDA to understand trends within the dataset. We will check the distribution of articles per category, analyse the distribution of article lengths, and visualise the most frequent words in each category using word clouds.
# Summary statistics
print("Dataset size:", len(df))
print("\nClass distribution:")
print(df["category"].value_counts())
# Category distribution
counts = df["category"].value_counts()
plt.figure()
plt.bar(counts.index, counts.values)
plt.title("Documents per Category")
plt.xlabel("Category")
plt.ylabel("Count")
plt.xticks(rotation=20)
plt.show()
# Document length (characters)
df["char_len"] = df["text"].str.len()
plt.figure()
plt.hist(df["char_len"], bins=40)
plt.title("Distribution of Document Lengths (characters)")
plt.xlabel("Characters")
plt.ylabel("Frequency")
plt.show()
# Create a grid for plots
fig, axes = plt.subplots(3, 2, figsize=(16, 15))
axes = axes.flatten()
# Plot wordclouds for each category
for i, cat in enumerate(categories):
    # Combine all cleaned articles in this category into one string
    text = " ".join(df[df.category == cat].clean_text)
    wc = WordCloud(stopwords=stop_words, background_color="white", width=800, height=400).generate(text)
    axes[i].imshow(wc, interpolation="bilinear")
    axes[i].axis("off")
    axes[i].set_title(f"Word Cloud for {cat}")
axes[5].axis("off")
plt.tight_layout()
plt.show()
Dataset size: 2225

Class distribution:
category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64
From this EDA, we conclude that the dataset is well balanced across categories and large enough to support classification with TF-IDF features.
Note that after preprocessing some generic words remain, such as said, would, and also. These are handled naturally by TF-IDF vectorisation, which down-weights terms that appear across many documents.
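As a quick optional check (not part of the pipeline above), we can count the most frequent tokens left after preprocessing to see which of these generic words remain.
# Optional check: most frequent tokens remaining after preprocessing
from collections import Counter

token_counts = Counter(" ".join(df["clean_text"]).split())
print(token_counts.most_common(15))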
Text Classification¶
We will split the dataset into training (80%) and test (20%) sets, using stratified sampling to preserve the class balance in both splits.
# Define features and labels
X = df["clean_text"]
y = df["category"]
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
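As an optional sanity check, we can confirm that the stratified split preserves the class proportions in both subsets.
# Optional check: class proportions in the train and test sets
print("Train distribution:\n", y_train.value_counts(normalize=True).round(3))
print("\nTest distribution:\n", y_test.value_counts(normalize=True).round(3))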
We then transform the text into numerical vectors using TF-IDF, which captures word importance and reduces the effect of frequent but less informative words.
The vectoriser will use up to 5,000 features and include both unigrams and bigrams.
# Transform text into numerical features with TF-IDF
vectoriser = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectoriser.fit_transform(X_train)
X_test_tfidf = vectoriser.transform(X_test)
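To see what the vectoriser produces, an optional check of the matrix shapes and a sample of the learned unigram/bigram features:
# Optional check: TF-IDF matrix shapes and a few learned features
print("Train matrix shape:", X_train_tfidf.shape)
print("Test matrix shape:", X_test_tfidf.shape)
print("Sample features:", vectoriser.get_feature_names_out()[:10])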
Finally, we train a Logistic Regression classifier on the TF-IDF vectors and display the evaluation metrics.
# Train logistic regression classifier
classifier = LogisticRegression(max_iter=800)
classifier.fit(X_train_tfidf, y_train)
# Predict on the test set using the classifier
predictions = classifier.predict(X_test_tfidf)
# Print model accuracy
print("Accuracy:", f"{accuracy_score(y_test, predictions):.4f}")
# Print Precision, recall and f1 score for all categories
print(classification_report(y_test, predictions))
# Compute and display confusion matrix
cm = confusion_matrix(y_test, predictions)
print("\nConfusion Matrix:\n")
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_)
display.plot(cmap=plt.cm.Blues)
plt.show()
Accuracy: 0.9865

               precision    recall  f1-score   support

     business       0.96      0.98      0.97       102
entertainment       0.99      0.99      0.99        77
     politics       1.00      0.99      0.99        84
        sport       1.00      1.00      1.00       102
         tech       0.99      0.97      0.98        80

     accuracy                           0.99       445
    macro avg       0.99      0.99      0.99       445
 weighted avg       0.99      0.99      0.99       445

Confusion Matrix:
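As an optional, illustrative inspection (not part of the evaluation above), we can look at the features the classifier weights most heavily for each category; each row of classifier.coef_ corresponds to one of classifier.classes_.
# Optional inspection: highest-weighted TF-IDF features per category
import numpy as np

feature_names = vectoriser.get_feature_names_out()
for idx, label in enumerate(classifier.classes_):
    top = np.argsort(classifier.coef_[idx])[::-1][:10]  # ten largest positive weights
    print(f"{label}: ", [feature_names[j] for j in top])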
Topic Modelling¶
We can also use an unsupervised learning approach, Latent Dirichlet Allocation (LDA), to uncover latent topics in the dataset. This approach groups articles into topics without using their category labels.
# Fit LDA model using TF-IDF features
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X_train_tfidf)
# Display top 10 words for each topic
terms = vectoriser.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    print(f"Topic {i+1}: ", [terms[j] for j in topic.argsort()[-10:]])
Topic 1: ['said', 'users', 'video', 'technology', 'people', 'tv', 'digital', 'games', 'mobile', 'music']
Topic 2: ['first', 'wales', 'team', 'match', 'cup', 'club', 'said', 'win', 'game', 'england']
Topic 3: ['actress', 'oscar', 'films', 'star', 'actor', 'festival', 'award', 'awards', 'best', 'film']
Topic 4: ['clothes', 'technologies', 'could used', 'midlands', 'shops', 'chip', 'broadcast', 'wearing', 'tags', 'rfid']
Topic 5: ['uk', 'new', 'year', 'people', 'us', 'government', 'would', 'bn', 'mr', 'said']
We find that the discovered topics largely align with the categories from the BBC News dataset.
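To check this alignment more directly (an optional addition to the steps above), we can assign each training article to its highest-probability topic and cross-tabulate against the true labels.
# Rough alignment check: dominant LDA topic per training article vs. true category
doc_topics = lda.transform(X_train_tfidf)        # topic distribution for each document
dominant_topic = doc_topics.argmax(axis=1) + 1   # 1-based topic number
print(pd.crosstab(y_train.values, dominant_topic, rownames=["category"], colnames=["topic"]))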
Conclusion¶
These results confirm that the NLP pipeline successfully produces a highly accurate text classifier for BBC news articles. The model achieved 98.6% accuracy on the test set, with strong performance consistent across all five categories. Sport achieved perfect scores in the classification report, while the other classes maintained precision and recall at or above 0.96. The confusion matrix also shows minimal misclassifications, highlighting that the model is both accurate and robust.
Although this model is highly effective, future work could build on this baseline by exploring deep learning and transformer-based approaches, as well as considering deployment in real-world applications.
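As a minimal sketch of such a deployment, the fitted preprocessing function, vectoriser, and classifier can be reused to label previously unseen text (the article below is an invented example for illustration).
# Minimal sketch: label new text with the fitted pipeline
# (the article string is an invented example, not from the dataset)
new_article = "The chancellor announced new spending plans ahead of the election."
new_features = vectoriser.transform([preprocess(new_article)])
print("Predicted category:", classifier.predict(new_features)[0])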
References¶
[1] D. Greene and P. Cunningham, "Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering", Proc. ICML 2006. Available at: http://mlg.ucd.ie/files/publications/greene06icml.pdf