Python: Generate Ngrams and Nterms

[1] Generate Ngrams

from itertools import islice

def generate_ngrams(text, ngram_size=(2, 2), min_word_size=3, return_tuple=True):
    """
    Generate strict n-grams (adjacent word sequences) from the input text.

    The text is split on whitespace, words shorter than ``min_word_size`` are
    dropped, and a sliding window of every size in ``ngram_size`` is run over
    the remaining words. Because filtering happens first, words that were
    separated only by dropped short words count as adjacent.

    Args:
        text (str): The input text to process.
        ngram_size (tuple, optional): Inclusive range of n-gram sizes to generate (min_n, max_n).
            Sizes below 1 are skipped because they do not describe an n-gram. Defaults to (2, 2).
        min_word_size (int, optional): Minimum word length to include in n-grams. Defaults to 3.
        return_tuple (bool, optional): Whether to return n-grams as tuples. If False, returns
            the words of each n-gram joined with "_". Defaults to True.

    Returns:
        list: A list of n-gram tuples or joined strings, ordered by size and then by
        position in the text. Empty when there are fewer words than the smallest size.
    """
    # Step 1: Tokenize on whitespace and drop words shorter than min_word_size
    filtered_tokens = [word for word in text.split() if len(word) >= min_word_size]

    # Step 2: Generate n-grams for each size in the range [min_n, max_n]
    min_n, max_n = ngram_size
    ngrams = []
    # Clamp the lower bound to 1: a zero or negative window size would yield
    # empty or arbitrary partial slices instead of real n-grams.
    for n in range(max(min_n, 1), max_n + 1):
        # Slices are lists; convert so the result really contains tuples
        # (hashable, and consistent with the documented return type).
        ngrams.extend(
            tuple(filtered_tokens[i:i + n])
            for i in range(len(filtered_tokens) - n + 1)
        )

    # Step 3: Format the output
    if not return_tuple:
        return ["_".join(ngram) for ngram in ngrams]

    return ngrams

[2] Generate Nterms

from itertools import combinations

def generate_nterms(text, ngram_size=(2, 2), min_word_size=3, return_tuple=True):
    """
    Build every word combination ("n-term") of the requested sizes from the text.

    Unlike strict n-grams, the words of an n-term do not have to be adjacent:
    each result is an n-element combination of the kept words, with the words
    in the same relative order as they appear in the text.

    Args:
        text (str): The input text to process.
        ngram_size (tuple, optional): Inclusive range of combination sizes (min_n, max_n). Defaults to (2, 2).
        min_word_size (int, optional): Words shorter than this are discarded before combining. Defaults to 3.
        return_tuple (bool, optional): If True, each combination is returned as a tuple of words;
            if False, as a single string with the words joined by "_". Defaults to True.

    Returns:
        list: Combinations grouped by size (smallest first), as tuples or joined strings.
    """
    # Keep only the whitespace-separated words that are long enough
    words = [tok for tok in text.split() if len(tok) >= min_word_size]

    # Collect the combinations size by size, from the lower to the upper bound
    lower, upper = ngram_size
    terms = []
    for size in range(lower, upper + 1):
        terms += combinations(words, size)

    # Hand back tuples as-is, or collapse each one into an underscore-joined string
    if return_tuple:
        return terms

    return ["_".join(term) for term in terms]

# Example usage: combine the words of at least 3 characters into every
# 1- to 3-word n-term and print them as underscore-joined strings.
text = "i literally have not been to sleep because i cannot believe this is happening"
# generate_nterms is the function defined in this file; the previous call to
# generate_terms referred to an undefined name and raised NameError.
ngrams = generate_nterms(text, ngram_size=(1, 3), min_word_size=3, return_tuple=False)
print(ngrams)