Python: Clean Text
Mohamad's interest is in Programming (Mobile, Web, Database and Machine Learning). He is studying at the Center For Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).
import re
from nltk.corpus import stopwords
# Ensure the NLTK stopwords corpus is available. Look for a local copy first
# and download only when it is missing, so that importing this module does
# not contact the NLTK server (and print download status) on every run.
import nltk

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
def clean_text(text, custom_stopwords=None):
    """
    Clean the input text by removing file paths, numbers, punctuation, and stopwords.

    The text is lowercased, Windows- and Unix-style paths are stripped, digits
    are removed, and the remaining whitespace-separated tokens are filtered
    against the NLTK English stopword list plus any custom stopwords.

    Args:
        text (str): The input text to process.
        custom_stopwords (iterable of str or str, optional): Extra words to
            remove. They are matched case-insensitively; a single string is
            treated as one word. Defaults to None.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    # Step 1: Convert text to lowercase
    text = text.lower()

    # Step 2: Remove file paths. ``\S+`` ends the match at ANY whitespace, so
    # a path followed by a tab or newline does not swallow the next word.
    text = re.sub(r"[a-zA-Z]:\\\S+", "", text)  # Windows file paths
    text = re.sub(r"/\S+", "", text)  # Unix-style file paths

    # Step 3: Remove numbers (punctuation is stripped per token in step 4)
    text = re.sub(r"\d+", "", text)

    # Step 4: Build the stopword set
    stop_words = set(stopwords.words('english'))
    if custom_stopwords:
        if isinstance(custom_stopwords, str):
            # A bare string would otherwise be added one character at a time.
            custom_stopwords = [custom_stopwords]
        # The text was lowercased above, so the stopwords must be too.
        stop_words.update(word.lower() for word in custom_stopwords)

    filtered_tokens = []
    for token in text.split():
        # First compare the token with only its surrounding punctuation
        # trimmed (and typographic apostrophes normalized). Stopword entries
        # that contain an apostrophe, such as "don't", can only match before
        # the inner punctuation is stripped.
        trimmed = re.sub(r"^[^\w\s]+|[^\w\s]+$", "", token.replace("\u2019", "'"))
        if trimmed in stop_words:
            continue

        # Remove punctuation and special characters. Underscores count as
        # word characters, so "follow_rts_likes" stays a single token.
        word = re.sub(r"[^\w\s]", "", token)

        # Tokens made only of punctuation/emoji become empty and are dropped.
        if word and word not in stop_words:
            filtered_tokens.append(word)

    # Step 5: Join the tokens back into a single string
    return " ".join(filtered_tokens)
# Example usage: clean a sample tweet that contains an emoji, a Windows path,
# and a few domain-specific words that should be dropped.
custom_stopwords = {"retweets", "everyone", "likes", "rts", "follow"}
text = (
    "who else going into the new year single 😊 "
    "c:\\ZE\\notebook\\DS2019 follow_rts_likes follow rts "
)
cleaned_text = clean_text(text=text, custom_stopwords=custom_stopwords)
print(cleaned_text)