Autolabel Airline Sentiment Dataset
Mohamad's interests are in programming (mobile, web, database, and machine learning). He is studying at the Center for Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).
[1] Prep dataset
Set the pandas DataFrame display column width
# Lift the column-width limit so long tweet text is shown in full
import pandas as pd

pd.options.display.max_colwidth = None
Get the source dataset and make a working copy of it
# Download the airline tweets CSV; df_airline keeps the full, unmodified data
import pandas as pd

dset_url = 'https://archive.org/download/misc-dataset/airline-tweets.csv'
df_airline = pd.read_csv(dset_url)

# Working copy restricted to the tweet text and its sentiment, so later
# column additions do not touch df_airline
df_wip = df_airline.loc[:, ['text', 'airline_sentiment']].copy()

# Preview the first row
df_wip.head(1)
[2] Clean text
!pip install neattext
import neattext.functions as nfx
# Build the 'clean' column by passing each tweet through the neattext
# helpers below, applied top to bottom, and finally lowercasing it.
# NOTE(review): remove_punctuations runs before remove_hashtags; if it strips
# the '#' character, the hashtag step may have nothing left to match - confirm.
df_wip['clean'] = (
    df_wip['text']
    .apply(nfx.remove_urls)
    .apply(nfx.remove_userhandles)
    .apply(nfx.remove_numbers)
    .apply(nfx.fix_contractions)
    .apply(nfx.remove_punctuations)
    .apply(nfx.remove_hashtags)
    .apply(str.lower)
)

# Show the cleaned text column
display(df_wip[['clean']])
output:

[3] Label text
define task
# Keyword rules used by label_tweet, listed in the order labels are emitted.
# Each entry pairs a label with the lowercase substrings that trigger it.
_LABEL_KEYWORDS = (
    ("lost_luggage", ("bag", "luggage", "lost")),
    ("flight_experience", ("delay", "late", "cancelled flight")),
    ("customer_service", ("customer service", "support", "help")),
    ("flight_booking_problems", ("booking", "rebook", "reservation")),
    ("food_beverage", ("food", "meal", "drink")),
    ("safety_security", ("safety", "security")),
    ("aircraft_condition", ("clean", "dirty", "condition")),
    ("positive_feedback", ("thank", "love", "great")),
)


def label_tweet(text):
    """Return the topic labels whose keywords occur in *text*.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside longer words (for example "delay" matches "delayed"). A tweet can
    receive several labels; they are returned in the fixed order of
    ``_LABEL_KEYWORDS``.

    Parameters:
    text (str): The tweet text to label.

    Returns:
    list: Matching label names. The list is empty when no keyword matches
    (there is no "miscellaneous" fallback) or when *text* is not a string,
    e.g. a missing value (None/NaN) coming from a DataFrame column.
    """
    # Missing cells arrive as None or float NaN; treat them as "no labels"
    # instead of failing on .lower().
    if not isinstance(text, str):
        return []

    # Lowercase once rather than once per category.
    lowered = text.lower()
    return [
        label
        for label, keywords in _LABEL_KEYWORDS
        if any(keyword in lowered for keyword in keywords)
    ]
run task
import pandas as pd

# Use the fully qualified option name: pandas warns that shorthand patterns
# such as "max_colwidth" can break if a similarly named option is added later.
pd.set_option("display.max_colwidth", None)

# Apply the labeling function to the cleaned text
df_wip["label"] = df_wip["clean"].apply(label_tweet)

# Number of labels assigned to each row
df_wip["label_count"] = df_wip["label"].map(len)

display(df_wip[['clean','label','label_count']])
output:

[4] Visualize the distribution of text labels
define visualization task
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
def plot_label_distribution(df, label_column):
    """
    Draw a bar chart of how often each label occurs in a DataFrame column.

    Every cell of the column is iterated and each item found is counted, so
    cells are expected to hold an iterable of labels (such as a list).

    Parameters:
    df (pd.DataFrame): The DataFrame containing the label data.
    label_column (str): The name of the column containing the labels.

    Returns:
    None
    """
    # Tally labels cell by cell; bars appear in order of first occurrence
    tally = Counter()
    for row_labels in df[label_column]:
        for label in row_labels:
            tally[label] += 1

    # One bar per distinct label
    plt.figure(figsize=(10, 6))
    plt.bar(tally.keys(), tally.values(), color='skyblue')

    # Axis captions and title
    plt.xlabel('Labels')
    plt.ylabel('Frequency')
    plt.title('Label Distribution')

    # Tilt the tick labels so long label names do not overlap
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
# Example usage
# df_wip = pd.DataFrame({'label': [['miscellaneous', 'flight_experience'], ['positive_feedback'], ['miscellaneous', 'customer_service'], ['flight_booking_problems']]})
# plot_label_distribution(df_wip, 'label')
run visualization task
# Chart how often each label was assigned across the working DataFrame
plot_label_distribution(df=df_wip, label_column='label')
output:
