# [1] Generate Ngrams
from itertools import islice
def generate_ngrams(text, ngram_size=(2, 2), min_word_size=3, return_tuple=True):
    """
    Generate strict n-grams (adjacent word sequences) from the input text.

    The text is split on whitespace and words shorter than ``min_word_size``
    are removed *before* the sliding window is applied, so an n-gram can join
    two words that were originally separated by a dropped short word.

    Args:
        text (str): The input text to process.
        ngram_size (tuple, optional): Inclusive range of n-gram sizes to
            generate, as (min_n, max_n). Sizes below 1 are skipped.
            Defaults to (2, 2).
        min_word_size (int, optional): Minimum word length to include in
            n-grams. Defaults to 3.
        return_tuple (bool, optional): Whether to return n-grams as tuples.
            If False, returns the words of each n-gram joined with "_".
            Defaults to True.

    Returns:
        list: N-gram tuples (or "_"-joined strings), ordered by size and
        then by position in the text. Empty when no size in the range fits
        the number of remaining words.
    """
    # Tokenize on whitespace and drop short words in one pass.
    tokens = [word for word in text.split() if len(word) >= min_word_size]

    min_n, max_n = ngram_size
    ngrams = []
    # Start at 1: a size of 0 would produce empty n-grams and a negative
    # size has no meaning for a sliding window.
    for n in range(max(min_n, 1), max_n + 1):
        # Zipping n views of the tokens, each shifted by one more position,
        # yields every window of n adjacent words as a tuple. zip stops at
        # the shortest view, so sizes larger than the token count yield
        # nothing.
        ngrams.extend(zip(*(islice(tokens, start, None) for start in range(n))))

    if not return_tuple:
        return ["_".join(ngram) for ngram in ngrams]
    return ngrams
# [2] Generate Nterms
from itertools import combinations
def generate_nterms(text, ngram_size=(2, 2), min_word_size=3, return_tuple=True):
    """
    Generate n-term combinations (not necessarily adjacent words) from the input text.

    Unlike strict n-grams, every combination of n words is produced, keeping
    the words in their original relative order. The number of results grows
    combinatorially with the number of words, so long texts with large sizes
    produce very large lists.

    Args:
        text (str): The input text to process.
        ngram_size (tuple, optional): Inclusive range of combination sizes
            to generate, as (min_n, max_n). Sizes below 1 are skipped.
            Defaults to (2, 2).
        min_word_size (int, optional): Minimum word length to include in
            the combinations. Defaults to 3.
        return_tuple (bool, optional): Whether to return combinations as
            tuples. If False, returns the words of each combination joined
            with "_". Defaults to True.

    Returns:
        list: Combination tuples (or "_"-joined strings), ordered by size
        and then in the order produced by itertools.combinations.
    """
    # Tokenize on whitespace and drop short words in one pass.
    tokens = [word for word in text.split() if len(word) >= min_word_size]

    min_n, max_n = ngram_size
    nterms = []
    # Start at 1: size 0 would add a single empty combination (an empty
    # string once joined) and a negative size makes combinations() raise.
    for n in range(max(min_n, 1), max_n + 1):
        # extend() consumes the iterator directly; no intermediate list needed.
        nterms.extend(combinations(tokens, n))

    if not return_tuple:
        return ["_".join(term) for term in nterms]
    return nterms
# Example usage: print every 1- to 3-word combination of the words that are
# at least 3 characters long, each joined with "_".
text = "i literally have not been to sleep because i cannot believe this is happening"
# The function defined in this file is generate_nterms; the previous name,
# generate_terms, does not exist and raised NameError.
ngrams = generate_nterms(text, ngram_size=(1, 3), min_word_size=3, return_tuple=False)
print(ngrams)