Reuters-21578 Topical News Dataset

Mohamad's interest is in Programming (Mobile, Web, Database and Machine Learning). He is studying at the Center For Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).
import pandas as pd

# Read the Reuters-21578 news CSV straight out of the zip archive hosted on archive.org.
df = pd.read_csv(
    'https://archive.org/download/misc-dataset/reuters21578_news.zip/reuters21578_news.csv'
)
# Discard the leading column (presumably a saved row index -- confirm against the CSV).
df = df.drop(columns=df.columns[0])
# Preview the first rows (only rendered when run in a notebook / REPL).
df.head()
Output:

The df['topics'] column contains stringified lists. Iterate through the 'topics' column and convert each value to a proper list.
# `ast` must be imported before this cell runs; it provides literal_eval,
# which safely parses the stringified lists without executing code.
import ast


def _to_topic_list(value):
    """Return *value* normalised to a list of topics.

    - A list is returned unchanged.
    - A string is parsed with ``ast.literal_eval``; if the result is a list
      it is returned, otherwise (or when parsing fails) the raw string is
      wrapped in a one-element list.
    - Anything else (e.g. ``None`` or NaN) becomes an empty list.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Not a Python literal: keep the raw text as a single topic.
        return [value]
    return parsed if isinstance(parsed, list) else [value]


# Convert every entry of the 'topics' column to a proper list.
# Series.map works per value, so it does not depend on the index labels
# matching the row positions (unlike enumerate() combined with df.at).
df['topics'] = df['topics'].map(_to_topic_list)
df[['topics']]
Output:

Get number of unique topics.
# Flatten the per-row topic lists into one list (duplicates kept, row order preserved).
all_topics = [topic for row_topics in df['topics'] for topic in row_topics]
# Collapse to the distinct topic labels and report how many there are.
unique_topics = set(all_topics)
unique_topic_counts = len(unique_topics)
print(f"Number of unique topic texts: {unique_topic_counts}")
print(unique_topics)
Output:
Number of unique topic texts: 120
{'ipi', 'gas', 'tapioca', 'coconut', 'lit', 'lei', 'acq', 'pet-chem', 'dfl', 'soy-meal', 'crude', 'dkr', 'cocoa', 'zinc', 'lin-oil', 'rape-meal', 'lin-meal', 'grain', 'jobs', 'cruzado', 'hog', 'yen', 'rapeseed', 'stg', 'wool', 'sunseed', 'nzdlr', 'reserves', 'plywood', 'sorghum', 'groundnut', 'barley', 'skr', 'instal-debt', 'nkr', 'income', 'coffee', 'cpi', 'l-cattle', 'silver', 'corn-oil', 'oat', 'housing', 'dmk', 'retail', 'oilseed', 'palladium', 'red-bean', 'nickel', 'groundnut-oil', 'money-fx', 'gold', 'tea', 'palmkernel', 'f-cattle', 'orange', 'wpi', 'pork-belly', 'nat-gas', 'sun-meal', 'platinum', 'money-supply', 'saudriyal', 'can', 'rupiah', 'trade', 'alum', 'austdlr', 'rape-oil', 'rand', 'coconut-oil', 'cottonseed', 'inventories', 'cpu', 'citruspulp', 'rye', 'jet', 'meal-feed', 'ringgit', 'earn', 'bop', 'cotton', 'soybean', 'castor-oil', 'iron-steel', 'bfr', 'ship', 'tin', 'propane', 'corn', 'potato', 'hk', 'rubber', 'fishmeal', 'lumber', 'dlr', 'peseta', 'sun-oil', 'interest', 'naphtha', 'fuel', 'gnp', 'castorseed', 'sugar', 'lead', 'wheat', 'copper', 'rice', 'carcass', 'strategic-metal', 'copra-cake', 'palm-oil', 'cotton-oil', 'livestock', 'veg-oil', 'cornglutenfeed', 'linseed', 'heat', 'soy-oil', 'sfr'}
Plot all topic frequencies
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np  # needed for np.linspace below

# Count the frequency of each topic
topic_counts = Counter(all_topics)
# (topic, count) pairs sorted by count, highest first
sorted_topic_counts = topic_counts.most_common()
topic_labels = [topic for topic, _ in sorted_topic_counts]
topic_totals = [count for _, count in sorted_topic_counts]

# Create the bar plot, one colour per bar sampled evenly from the Dark2 colormap
plt.figure(figsize=(12, 6))
bars = plt.bar(
    topic_labels,
    topic_totals,
    color=plt.cm.Dark2(np.linspace(0, 1, len(sorted_topic_counts))),
)

# Add percentage labels on top of the bars (share of all topic assignments)
total_count = sum(topic_totals)
for bar, count in zip(bars, topic_totals):
    percentage = (count / total_count) * 100
    plt.annotate(
        f"{percentage:.2f}%",
        (bar.get_x() + bar.get_width() / 2, count),
        ha='center', va='bottom', xytext=(0, 5), textcoords='offset points',
    )

plt.title('All Topics')
plt.xlabel('Topic')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Output:

Plot top 10 topic frequencies
import ast
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np  # needed for np.linspace below
import pandas as pd


def _to_topic_list(value):
    """Return *value* normalised to a list of topics.

    Lists pass through unchanged; strings are parsed with
    ``ast.literal_eval`` (falling back to ``[value]`` when the result is not
    a list or parsing fails); any other value becomes an empty list.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return [value]
    return parsed if isinstance(parsed, list) else [value]


# Convert the 'topics' column to proper lists. Values that are already
# lists are left untouched, so repeating the conversion is harmless.
df['topics'] = df['topics'].map(_to_topic_list)

# Create a list of all topics
all_topics = [topic for row_topics in df['topics'] for topic in row_topics]

# Count the frequency of each topic
topic_counts = Counter(all_topics)
# (topic, count) pairs sorted by count, highest first
sorted_topic_counts = topic_counts.most_common()

# Get the top 10 topics (fewer if the data holds fewer distinct topics)
top_10_topics = [topic for topic, _ in sorted_topic_counts[:10]]
top_10_counts = [count for _, count in sorted_topic_counts[:10]]

# Create the bar plot; the number of colours follows the number of bars
# actually drawn instead of assuming exactly ten.
plt.figure(figsize=(12, 6))
bars = plt.bar(
    top_10_topics,
    top_10_counts,
    color=plt.cm.Dark2(np.linspace(0, 1, len(top_10_topics))),
)

# Add percentage labels on top of the bars.
# Percentages are relative to the combined count of the plotted topics only.
total_count = sum(top_10_counts)
for bar, count in zip(bars, top_10_counts):
    percentage = (count / total_count) * 100
    plt.annotate(
        f"{percentage:.2f}%",
        (bar.get_x() + bar.get_width() / 2, count),
        ha='center', va='bottom', xytext=(0, 5), textcoords='offset points',
    )

plt.title('Top 10 Topics')
plt.xlabel('Topic')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Output:
