Reuters-21578 Topical News Dataset

import pandas as pd
# Load the Reuters news CSV straight from the archive.org mirror
df = pd.read_csv(
    'https://archive.org/download/misc-dataset/reuters21578_news.zip/reuters21578_news.csv'
)
# Discard the first column of the file
df = df.drop(columns=df.columns[0])
# Show the first few rows
df.head()

Output:

The df['topics'] column contains stringified lists. Iterate through the 'topics' column and convert each value to a proper Python list.

# `ast` must be imported before this cell runs: without it the conversion
# raises NameError, which the (ValueError, SyntaxError) handler does not catch.
import ast


def parse_topics(value):
    """Return one 'topics' cell as a list of topic labels.

    Parameters
    ----------
    value : object
        Raw cell content from the 'topics' column.

    Returns
    -------
    list
        * ``value`` itself when it is already a list;
        * the parsed list when ``value`` is a string holding a Python list
          literal, e.g. ``"['grain', 'wheat']"``;
        * ``[value]`` when the string cannot be parsed, or parses to
          something that is not a list;
        * ``[]`` for anything else (e.g. None / NaN).
    """
    if isinstance(value, list):
        return value  # already converted; keeps the cell safe to re-run
    if not isinstance(value, str):
        return []
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: keep the raw text as a single topic
        return [value]
    return parsed if isinstance(parsed, list) else [value]


# Convert the whole column in one pass instead of writing cell-by-cell
df['topics'] = df['topics'].apply(parse_topics)

df[['topics']]

Output:

Get number of unique topics.

# Count how many distinct topic labels occur in the dataset

# Flatten the per-row topic lists into one long list
all_topics = [label for row_topics in df['topics'] for label in row_topics]

# A set keeps each label once, so its size is the number of distinct topics
unique_topic_counts = len(set(all_topics))

print(f"Number of unique topic texts: {unique_topic_counts}")

print(set(all_topics))

Output:

Number of unique topic texts: 120
{'ipi', 'gas', 'tapioca', 'coconut', 'lit', 'lei', 'acq', 'pet-chem', 'dfl', 'soy-meal', 'crude', 'dkr', 'cocoa', 'zinc', 'lin-oil', 'rape-meal', 'lin-meal', 'grain', 'jobs', 'cruzado', 'hog', 'yen', 'rapeseed', 'stg', 'wool', 'sunseed', 'nzdlr', 'reserves', 'plywood', 'sorghum', 'groundnut', 'barley', 'skr', 'instal-debt', 'nkr', 'income', 'coffee', 'cpi', 'l-cattle', 'silver', 'corn-oil', 'oat', 'housing', 'dmk', 'retail', 'oilseed', 'palladium', 'red-bean', 'nickel', 'groundnut-oil', 'money-fx', 'gold', 'tea', 'palmkernel', 'f-cattle', 'orange', 'wpi', 'pork-belly', 'nat-gas', 'sun-meal', 'platinum', 'money-supply', 'saudriyal', 'can', 'rupiah', 'trade', 'alum', 'austdlr', 'rape-oil', 'rand', 'coconut-oil', 'cottonseed', 'inventories', 'cpu', 'citruspulp', 'rye', 'jet', 'meal-feed', 'ringgit', 'earn', 'bop', 'cotton', 'soybean', 'castor-oil', 'iron-steel', 'bfr', 'ship', 'tin', 'propane', 'corn', 'potato', 'hk', 'rubber', 'fishmeal', 'lumber', 'dlr', 'peseta', 'sun-oil', 'interest', 'naphtha', 'fuel', 'gnp', 'castorseed', 'sugar', 'lead', 'wheat', 'copper', 'rice', 'carcass', 'strategic-metal', 'copra-cake', 'palm-oil', 'cotton-oil', 'livestock', 'veg-oil', 'cornglutenfeed', 'linseed', 'heat', 'soy-oil', 'sfr'}

Plot all topic frequencies

from collections import Counter

import matplotlib.pyplot as plt
import numpy as np  # needed for np.linspace below; it was never imported

# Count the frequency of each topic
topic_counts = Counter(all_topics)

# (topic, count) pairs sorted by descending count
sorted_topic_counts = topic_counts.most_common()

# Split the pairs once so the plot and the labels share the same sequences
topic_labels = [topic for topic, _ in sorted_topic_counts]
topic_frequencies = [count for _, count in sorted_topic_counts]

# Create the bar plot, one Dark2 colour sample per bar
plt.figure(figsize=(12, 6))
bars = plt.bar(topic_labels, topic_frequencies,
               color=plt.cm.Dark2(np.linspace(0, 1, len(sorted_topic_counts))))

# Add percentage labels on top of the bars.
# Each percentage is the topic's share of all topic assignments.
total_count = sum(topic_frequencies)
for bar, count in zip(bars, topic_frequencies):
    percentage = (count / total_count) * 100
    plt.annotate(f"{percentage:.2f}%", (bar.get_x() + bar.get_width() / 2, count),
                 ha='center', va='bottom', xytext=(0, 5), textcoords='offset points')

plt.title('All Topics')
plt.xlabel('Topic')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Output:

Plot top 10 topic frequencies

import ast
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np  # needed for np.linspace below; it was never imported
import pandas as pd

# Normalise the 'topics' column to real lists so this cell can run on its own.
# Rows that already hold a list are left alone, so re-running is harmless.
# NOTE: enumerate() positions are used as index labels, which assumes df
# still has its default 0..n-1 index.
for i, topics in enumerate(df['topics']):
    if isinstance(topics, str):
        try:
            topics_list = ast.literal_eval(topics)
            if isinstance(topics_list, list):
                df.at[i, 'topics'] = topics_list
            else:
                df.at[i, 'topics'] = [topics]
        except (ValueError, SyntaxError):
            # Unparseable text is kept as a single raw topic
            df.at[i, 'topics'] = [topics]
    elif isinstance(topics, list):
        pass  # Do nothing, the value is already a list
    else:
        df.at[i, 'topics'] = []  # Set to empty list if the value is None or another data type

# Flatten the per-row topic lists into one list
all_topics = [label for row_topics in df['topics'] for label in row_topics]

# Count the frequency of each topic
topic_counts = Counter(all_topics)

# (topic, count) pairs sorted by descending count
sorted_topic_counts = topic_counts.most_common()

# Get the top 10 topics (fewer if the data has fewer than 10 distinct topics)
top_10 = sorted_topic_counts[:10]
top_10_topics = [topic for topic, _ in top_10]
top_10_counts = [count for _, count in top_10]

# Create the bar plot; sample exactly one colour per bar actually drawn
plt.figure(figsize=(12, 6))
bars = plt.bar(top_10_topics, top_10_counts,
               color=plt.cm.Dark2(np.linspace(0, 1, len(top_10_topics))))

# Add percentage labels on top of the bars.
# These percentages are each topic's share of the top-10 total only,
# not of all topic assignments in the dataset.
total_count = sum(top_10_counts)
for bar, count in zip(bars, top_10_counts):
    percentage = (count / total_count) * 100
    plt.annotate(f"{percentage:.2f}%", (bar.get_x() + bar.get_width() / 2, count),
                 ha='center', va='bottom', xytext=(0, 5), textcoords='offset points')

plt.title('Top 10 Topics')
plt.xlabel('Topic')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Output:

Reuters-21578 Topical News Dataset

Comments

More from this blog

Getting Started with CI/CD Using GitHub Actions and Docker

Exploring Cloud Automation and DevOps with VirtualBox and Docker

Exploring Docker Networking and Understanding Cloud Networking

Exploring VirtualBox Networking and Understanding Cloud Networking

Docker Fundamentals: Imperative and Declarative Deployment

Command Palette

Comments

More from this blog