Skip to main content

Command Palette

Search for a command to run...

Python: Preprocess Social Text

Published
3 min read
M

Mohamad is interested in programming (mobile, web, databases, and machine learning). He studies at the Center for Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).

Step 1: Remove trailing emoji

Step 2: Join negation words with their next word

Step 3: Remove stopwords

Step 4: Remove short words

Step 5: Lemmatize words

Step 6: Generate n-grams (optional)

import re
import emoji
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.corpus import wordnet
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk import ngrams

def process_text(text, set_NegationWord, stop_words=None, min_word_size=3):
    """
    Clean a piece of social text and return it as unigram + bigram features.

    The steps are applied in this order:
    1. Remove trailing emojis (standalone emoji tokens at the end of the text).
    2. Join negation words with their next word ("not good" -> "not_good").
    3. Remove stopwords.
    4. Remove words shorter than min_word_size.
    5. Lowercase and lemmatize words, dropping words WordNet does not know.
       For a negation-joined token only the part after the negation word is
       lemmatized and checked, so the negation marker is preserved.
    6. Generate n-grams: the result holds the unigrams followed by the
       bigrams, each bigram joined with "_".

    Args:
        text (str): The input text to process.
        set_NegationWord (set): A set of lowercase negation words.
        stop_words (set, optional): A set of stopwords to remove. Defaults to NLTK English stopwords.
        min_word_size (int, optional): Minimum word length to keep. Words shorter than this will be removed.

    Returns:
        str: Space-separated unigrams followed by underscore-joined bigrams.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    # Step 1: Remove trailing emoji
    def remove_trailing_emoji(text):
        tokens = text.split()
        # Only whitespace-separated emoji tokens at the very end are stripped.
        while tokens and emoji.is_emoji(tokens[-1]):
            tokens.pop()
        return " ".join(tokens)

    # Step 2: Join negation words with their next word
    def join_negation_words(text):
        words = text.split()
        processed_words = []
        i = 0
        while i < len(words):
            # A negation word at the very end has nothing to join with.
            if words[i].lower() in set_NegationWord and i + 1 < len(words):
                processed_words.append(f"{words[i]}_{words[i + 1]}")
                i += 2  # Skip the next word since it's already combined
                continue
            processed_words.append(words[i])
            i += 1
        return " ".join(processed_words)

    # Step 3: Remove stopwords
    def remove_stopwords(text, stop_words):
        return " ".join(
            word for word in text.split() if word.lower() not in stop_words
        )

    # Step 4: Remove short words
    def remove_short_words(text, min_word_size):
        return " ".join(
            word for word in text.split() if len(word) >= min_word_size
        )

    # Step 5: Lemmatize words
    def lemmatize_tokens(tokens, return_list=False):
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = []

        for token in tokens:
            lowered = token.lower()
            negation, separator, negated_word = lowered.partition("_")

            if separator and negated_word and negation in set_NegationWord:
                # Token built by step 2. Looking up the joined form in WordNet
                # would find nothing and silently discard the negation, so
                # only the negated word is lemmatized and validated.
                lemma = lemmatizer.lemmatize(negated_word)
                if wordnet.synsets(lemma):
                    lemmatized_tokens.append(f"{negation}_{lemma}")
                continue

            lemma = lemmatizer.lemmatize(lowered)
            # Check if the lemma exists in WordNet by looking for synsets
            if wordnet.synsets(lemma):
                lemmatized_tokens.append(lemma)

        if return_list:
            return lemmatized_tokens
        return ' '.join(lemmatized_tokens)

    # Step 6: Generate Ngram (optional)
    def generate_ngram_nltk(tokens, ngram_range=(2, 2), return_tuple=True, return_list=False):
        tokens = list(tokens)
        ngrams_list = []
        min_n, max_n = ngram_range

        for n in range(min_n, max_n + 1):  # Generate n-grams for each value in the range
            generated_ngrams = ngrams(tokens, n)
            if return_tuple and return_list:
                ngrams_list.extend(generated_ngrams)  # Keep as tuples
            else:
                # Tuples cannot be joined into a string result, so every gram
                # is flattened to text whenever a string is being returned.
                ngrams_list.extend("_".join(gram) for gram in generated_ngrams)

        if return_list:
            return ngrams_list
        return ' '.join(ngrams_list)

    # Apply the steps in sequence. Negation joining runs before stopword
    # removal so that a negation word that is also a stopword survives as
    # part of its joined token.
    text = remove_trailing_emoji(text)
    text = join_negation_words(text)
    text = remove_stopwords(text, stop_words)
    text = remove_short_words(text, min_word_size)
    text = lemmatize_tokens(text.split(), return_list=False)
    text = generate_ngram_nltk(text.split(), ngram_range=(1, 2), return_tuple=False, return_list=False)

    return text

# Example usage
# The negation set must be defined before the call below; leaving it
# commented out makes the call fail with NameError.
set_NegationWord = {
    'no', 'not', 'cannot', 'never', 'neither', 'nor',
    'none', 'nowhere', 'nothing', 'naught', 'nobody',
    'nevermore', 'lack'
}

text = "i literally have not been to sleep 🙄 because i cannot believe this is happening"
processed_text = process_text(text, set_NegationWord, min_word_size=3)
print(processed_text)