Skip to main content

Command Palette

Search for a command to run...

Autolabel Airline Sentiment Dataset

Updated
2 min read
M

Mohamad's interest is in Programming (Mobile, Web, Database and Machine Learning). He is studying at the Center For Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).

[1] Prep dataset

Set the pandas DataFrame display column width

# Lift the column-width limit so long text cells are displayed in full
# (None means "no limit" for this option).
import pandas as pd
pd.options.display.max_colwidth = None

Get the source dataset and make a working copy of it

# Download the airline tweets CSV into df_airline, which is left untouched
# as the raw reference frame.
import pandas as pd
dset_url = 'https://archive.org/download/misc-dataset/airline-tweets.csv'
df_airline = pd.read_csv(dset_url)

# Work on an independent copy that keeps only the tweet text and its
# sentiment column, so later edits never modify df_airline.
wanted_columns = ['text', 'airline_sentiment']
df_wip = df_airline.loc[:, wanted_columns].copy()
df_wip.head(1)

[2] Clean text

!pip install neattext

import neattext.functions as nfx

# Build the 'clean' column by running every tweet through the neattext
# helpers below, in this exact order, and lower-casing the result last.
raw_text = df_wip['text']
df_wip['clean'] = (
    raw_text
    .apply(nfx.remove_urls)
    .apply(nfx.remove_userhandles)
    .apply(nfx.remove_numbers)
    .apply(nfx.fix_contractions)
    .apply(nfx.remove_punctuations)
    .apply(nfx.remove_hashtags)
    .apply(str.lower)
)

# Show only the cleaned text column.
display(df_wip[['clean']])

output:

[3] Label text

define task


# Keyword rules for labelling tweets. Each entry pairs a label with the
# lowercase substrings that trigger it; the order of the entries is the order
# in which labels appear in the list returned by label_tweet().
_LABEL_KEYWORDS = (
    ("lost_luggage", ("bag", "luggage", "lost")),
    ("flight_experience", ("delay", "late", "cancelled flight")),
    ("customer_service", ("customer service", "support", "help")),
    ("flight_booking_problems", ("booking", "rebook", "reservation")),
    ("food_beverage", ("food", "meal", "drink")),
    ("safety_security", ("safety", "security")),
    ("aircraft_condition", ("clean", "dirty", "condition")),
    ("positive_feedback", ("thank", "love", "great")),
)


def label_tweet(text):
    """
    Return the topic labels whose keywords occur in a tweet.

    Matching is a case-insensitive substring test, so "delay" also matches
    "delayed" and "bag" also matches "bags". The same rule means a short
    keyword can match inside an unrelated word (e.g. "late" in "plate").
    A tweet may receive several labels. No fallback label is added: text
    that matches nothing yields an empty list.

    Parameters:
    text (str): The tweet text to label.

    Returns:
    list: The matching labels, in rule order. Empty when nothing matches or
        when `text` is not a string (for example a missing value).
    """
    # Missing values (None, NaN) have no .lower(); treat them as unlabelled
    # instead of raising.
    if not isinstance(text, str):
        return []

    # Lower-case once instead of once per keyword check.
    lowered = text.lower()

    return [
        label
        for label, keywords in _LABEL_KEYWORDS
        if any(keyword in lowered for keyword in keywords)
    ]

run task

import pandas as pd
# Use the fully qualified option key (the same one set earlier in this
# notebook); abbreviated keys are resolved by pattern matching and can become
# ambiguous if pandas adds similarly named options.
pd.set_option("display.max_colwidth", None)

# Label every cleaned tweet, then record how many labels each tweet received.
df_wip["label"] = df_wip["clean"].apply(label_tweet)
df_wip["label_count"] = df_wip["label"].apply(len)

display(df_wip[['clean','label','label_count']])

output:

[4] Visualize the distribution of text labels

define visualization task

import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

def plot_label_distribution(df, label_column):
    """
    Draw a bar chart of how often each label occurs in a DataFrame column.

    Each row of the column is iterated as a collection of labels (for
    example a list); the labels of all rows are pooled before counting.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the label data.
    label_column (str): The name of the column containing the labels.

    Returns:
    None
    """
    # Count every label across all rows in one pass, without building an
    # intermediate flattened list.
    tally = Counter(
        label
        for row_labels in df[label_column]
        for label in row_labels
    )

    # Bars appear in first-seen order, which is the Counter's key order.
    names = list(tally.keys())
    frequencies = list(tally.values())

    plt.figure(figsize=(10, 6))
    plt.bar(names, frequencies, color='skyblue')
    plt.xlabel('Labels')
    plt.ylabel('Frequency')
    plt.title('Label Distribution')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Example usage
# df_wip = pd.DataFrame({'label': [['miscellaneous', 'flight_experience'], ['positive_feedback'], ['miscellaneous', 'customer_service'], ['flight_booking_problems']]})
# plot_label_distribution(df_wip, 'label')

run visualization task

# Plot the label frequencies found in the 'label' column of df_wip.
plot_label_distribution(df_wip, 'label')

output:

Autolabel Airline Sentiment Dataset