Python: Preprocess Social Text
Mohamad's interest is in Programming (Mobile, Web, Database and Machine Learning). He is studying at the Center For Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).
Step 1: Remove trailing emoji
Step 2: Join negation words with their next word
Step 3: Remove stopwords
Step 4: Remove short words
Step 5: Lemmatize words
Step 6: Generate Ngram (optional)
import re
import emoji
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.corpus import wordnet
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
def process_text(text: str, set_NegationWord, stop_words=None,
                 min_word_size: int = 3, ngram_range=(1, 2)) -> str:
    """
    Clean a piece of social-media text and expand it into n-gram features.

    The steps run in this order:
        1. Remove emoji tokens from the end of the text.
        2. Join each negation word with the word that follows it
           (e.g. "not good" -> "not_good").
        3. Remove stopwords.
        4. Remove tokens shorter than min_word_size characters.
        5. Lowercase and lemmatize the remaining tokens, dropping tokens
           that have no WordNet entry.
        6. Generate n-grams (optional).

    Args:
        text (str): The input text to process. Tokens are whitespace-separated.
        set_NegationWord (set): Lowercase negation words, e.g. {"not", "never"}.
        stop_words (set, optional): Stopwords to remove. Defaults to the NLTK
            English stopword list.
        min_word_size (int, optional): Minimum token length to keep. Tokens
            shorter than this are removed. Defaults to 3.
        ngram_range (tuple, optional): Inclusive (min_n, max_n) range of
            n-gram sizes to generate. Defaults to (1, 2). Pass None to skip
            n-gram generation and return the lemmatized tokens only.

    Returns:
        str: The processed tokens (or n-grams) separated by single spaces.
        The words inside an n-gram are joined with "_". Empty or
        whitespace-only input yields "".
    """
    # Nothing to process: return early so the stopword list and WordNet are
    # not loaded for empty input.
    if not text or not text.strip():
        return ""

    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    # Step 1: Remove trailing emoji
    def remove_trailing_emoji(tokens):
        """Drop every token at the end of the list that emoji.is_emoji accepts."""
        end = len(tokens)
        while end > 0 and emoji.is_emoji(tokens[end - 1]):
            end -= 1
        return tokens[:end]

    # Step 2: Join negation words with their next word
    def join_negation_words(tokens):
        """Merge each negation word with its successor into 'neg_word'."""
        joined = []
        i = 0
        while i < len(tokens):
            has_next = i + 1 < len(tokens)
            if has_next and tokens[i].lower() in set_NegationWord:
                joined.append(f"{tokens[i]}_{tokens[i + 1]}")
                i += 2  # Skip the next word since it's already combined
            else:
                # A negation word at the very end has nothing to join with.
                joined.append(tokens[i])
                i += 1
        return joined

    # Step 3: Remove stopwords
    def remove_stopwords(tokens):
        """Keep tokens whose lowercase form is not a stopword."""
        return [word for word in tokens if word.lower() not in stop_words]

    # Step 4: Remove short words
    def remove_short_words(tokens):
        """Keep tokens that are at least min_word_size characters long."""
        return [word for word in tokens if len(word) >= min_word_size]

    # Step 5: Lemmatize words
    def lemmatize_tokens(tokens):
        """Lowercase and lemmatize tokens, keeping only WordNet-known words."""
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = []
        for token in tokens:
            lowered = token.lower()
            prefix, sep, rest = lowered.partition("_")
            if sep and rest and prefix in set_NegationWord:
                # Token produced by Step 2. "not_been" as a whole has no
                # WordNet entry, so checking it directly would always drop
                # it and discard the negation. Validate the negated word
                # instead and keep the negation prefix attached.
                lemma = lemmatizer.lemmatize(rest)
                if wordnet.synsets(lemma):
                    lemmatized_tokens.append(f"{prefix}_{lemma}")
                continue
            lemma = lemmatizer.lemmatize(lowered)
            # Check if the lemma exists in WordNet by looking for synsets
            if wordnet.synsets(lemma):
                lemmatized_tokens.append(lemma)
        return lemmatized_tokens

    # Step 6: Generate Ngram (optional)
    def generate_ngram_nltk(tokens, ngram_range=(2, 2), return_tuple=True, return_list=False):
        """Build n-grams for every n in the inclusive ngram_range.

        With return_list=True the result is a list of tuples
        (return_tuple=True) or of "_"-joined strings (return_tuple=False).
        With return_list=False the result is one space-separated string.
        """
        min_n, max_n = ngram_range
        grams = []
        for n in range(min_n, max_n + 1):  # Generate n-grams for each value in the range
            grams.extend(ngrams(tokens, n))
        if not return_list:
            # str.join only accepts strings, so the tuples are always
            # flattened to "a_b" form when a single string is requested.
            return ' '.join("_".join(gram) for gram in grams)
        if return_tuple:
            return grams
        return ["_".join(gram) for gram in grams]

    # Apply the steps in sequence
    tokens = remove_trailing_emoji(text.split())
    tokens = join_negation_words(tokens)
    tokens = remove_stopwords(tokens)
    tokens = remove_short_words(tokens)
    tokens = lemmatize_tokens(tokens)
    if ngram_range is None:
        return ' '.join(tokens)
    return generate_ngram_nltk(tokens, ngram_range=ngram_range, return_tuple=False, return_list=False)
# Example usage
# Negation words that process_text joins to the word following them.
# The set must be defined before the call below; leaving it commented out
# makes the call fail with NameError.
set_NegationWord = {
    'no', 'not', 'cannot', 'never', 'neither', 'nor',
    'none', 'nowhere', 'nothing', 'naught', 'nobody',
    'nevermore', 'lack'
}
text = "i literally have not been to sleep 🙄 because i cannot believe this is happening"
processed_text = process_text(text, set_NegationWord, min_word_size=3)
print(processed_text)