Skip to main content

Command Palette

Search for a command to run...

Python: Clean Text

Published
1 min read
M

Mohamad's interests are in programming (mobile, web, database, and machine learning). He is studying at the Center for Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).

import re

import nltk
from nltk.corpus import stopwords

# Ensure the NLTK stopwords corpus is available. Only hit the downloader when
# the corpus is actually missing, so importing this module does not trigger a
# network check (and console output) on every run.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

def clean_text(text, custom_stopwords=None):
    """
    Clean the input text by removing file paths, numbers, punctuation, and stopwords.

    The text is lowercased, path-like tokens are dropped, digits and
    non-word characters are stripped, and the remaining tokens are filtered
    against the NLTK English stopword list plus any custom stopwords.

    Args:
        text (str): The input text to process.
        custom_stopwords (set, optional): A set of custom stopwords to remove.
            Matching is case-insensitive and ignores punctuation, mirroring
            the normalization applied to the text. Defaults to None.

    Returns:
        str: The cleaned text, with tokens separated by single spaces.
            An empty string is returned when nothing remains.
    """
    # Nothing to clean; avoid loading the stopword corpus for empty input.
    if not text:
        return ""

    # Step 1: Convert text to lowercase
    text = text.lower()

    # Step 2: Remove file paths using regex.
    # \S+ (rather than "anything but a space") ends the match at any
    # whitespace, so a path followed by a newline or tab does not swallow
    # the text that comes after it.
    text = re.sub(r"[a-zA-Z]:\\\S+", "", text)  # Remove Windows file paths
    text = re.sub(r"/\S+", "", text)            # Remove Unix-style file paths

    # Step 3: Remove numbers and special characters
    text = re.sub(r"\d+", "", text)              # Remove numbers
    text = re.sub(r"[^\w\s]", "", text)          # Remove punctuation and special characters

    tokens = text.split()
    if not tokens:
        return ""

    # Step 4: Remove stopwords
    stop_words = set(stopwords.words('english'))
    if custom_stopwords:
        stop_words.update(custom_stopwords)

    # The tokens are lowercase and punctuation-free at this point, so add the
    # same normalized form of every stopword. Otherwise entries such as
    # "don't" (token: "dont") or a custom "Follow" (token: "follow") could
    # never match.
    stop_words |= {re.sub(r"[^\w\s]", "", word.lower()) for word in stop_words}

    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Step 5: Join the tokens back into a single string
    return " ".join(filtered_tokens)

# Demo: clean a sample tweet that contains an emoji and a Windows file path,
# filtering a few domain-specific words in addition to the English stopwords.
custom_stopwords = {"retweets", "everyone", "likes", "rts", "follow"}
text = "who else going into the new year single 😊 c:\\ZE\\notebook\\DS2019 follow_rts_likes follow rts "
cleaned_text = clean_text(text, custom_stopwords=custom_stopwords)
print(cleaned_text)