Skip to main content

Command Palette

Search for a command to run...

Transformers Pipeline: Translate English Phrase to French

Published
2 min read
M

Mohamad's interest is in Programming (Mobile, Web, Database and Machine Learning). He is studying at the Center For Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).

[1] Get dataset


import pandas as pd
# Load dataset
# The CSV is fetched straight from its URL; pd.read_csv accepts remote paths.
dset_url = 'https://archive.org/download/misc-dataset/airline-tweets.csv'
df_airline = pd.read_csv(dset_url)
# Display information about the dataset
# (info() prints the column names, non-null counts and dtypes)
df_airline.info()

[2] Clean

import pandas as pd
# !pip install neattext
import neattext.functions as nfx

# Clean the text data.
# Order matters: user handles, URLs and hashtags are recognised by their
# punctuation ('@', '://', '#'), so they must be stripped BEFORE
# remove_punctuations runs; otherwise their remnants stay in the text.
# Contractions are expanded before punctuation removal for the same reason
# (the apostrophe is what identifies them).
df_airline['clean'] = df_airline['text'] \
    .apply(nfx.remove_userhandles) \
    .apply(nfx.remove_urls) \
    .apply(nfx.remove_hashtags) \
    .apply(nfx.remove_numbers) \
    .apply(nfx.fix_contractions) \
    .apply(nfx.remove_punctuations) \
    .apply(str.lower)

Define the phrase-extraction function

# !pip install rake-nltk
from rake_nltk import Rake

def rake_extract_phrases(input_text):
    """Run RAKE on *input_text* and return its scored key phrases.

    Returns a list of ``(score, phrase)`` tuples ordered from the highest
    score to the lowest.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(input_text)

    # Copy the scored phrases, then order them best-first by score (item 0).
    scored = list(extractor.get_ranked_phrases_with_scores())
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored

# Example usage
# Quick sanity check on a single sentence: prints the list returned by
# rake_extract_phrases for it.
input_string = "thank you we got on a different flight."
result = rake_extract_phrases(input_string)
print(result)

Extract phrases

# Score the key phrases of every cleaned tweet: one list of (score, phrase) tuples per row
df_airline['phrase_score'] = df_airline['clean'].apply(rake_extract_phrases)
# Keep only the phrase text (item 1 of each tuple)
df_airline['phrase'] = [[item[1] for item in list_item] for list_item in df_airline['phrase_score']]
# Preview the frame that was just extended; df_airline_phrase does not exist
# until the explode step, so displaying it here would raise NameError
display(df_airline.head())

Explode phrases into separate documents

# One row per phrase: keep only the needed columns, then unpack each phrase
# list into separate rows and renumber the index from 0
phrase_columns = ['phrase', 'airline_sentiment', 'negativereason']
df_airline_phrase = (
    df_airline[phrase_columns]
    .explode('phrase')
    .reset_index(drop=True)
)
display(df_airline_phrase)

Select phrase documents

# Keep only multi-word phrases taken from negative tweets whose recorded
# reason is 'Customer Service Issue'.
df_airline_phrase_selected = df_airline_phrase[
    (df_airline_phrase['airline_sentiment'] == 'negative') &
    (df_airline_phrase['negativereason'].notna()) &  # Use `notna()` to filter out NaN values
    (df_airline_phrase['negativereason'] == 'Customer Service Issue') &
    (df_airline_phrase['phrase'].str.split().str.len() > 1)  # Filter phrases with 2 or more words
]
print(len(df_airline_phrase_selected))
# Drop rows without a phrase and take an explicit copy: a new column is
# assigned to this frame later, and writing to a filtered slice of
# df_airline_phrase can trigger pandas' SettingWithCopyWarning.
df_airline_phrase_selected = df_airline_phrase_selected.dropna(subset=['phrase']).copy()
print(len(df_airline_phrase_selected))
display(df_airline_phrase_selected.head())

[3] Translate

(approx. 30 min with CUDA)

from transformers import pipeline

# Load the translation pipeline, on the GPU when one is available
import torch

# device=0 selects the first CUDA GPU; -1 runs the pipeline on the CPU, so the
# notebook still works (more slowly) on machines without CUDA instead of failing
translation_device = 0 if torch.cuda.is_available() else -1
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr", device=translation_device)

# Translate a single English text to French
def translate_to_french(text):
    """Return the French translation of *text*, or "" when none is produced.

    Blank input (``None``, empty or whitespace-only) yields "" without
    calling the module-level ``translator``. Any exception raised along the
    way is printed and also results in "", so the function can be applied to
    a whole column without aborting on one bad row.
    """
    try:
        # Nothing to translate: skip the model call entirely
        is_blank = not text or not text.strip()
        if is_blank:
            return ""

        # max_length caps the length of the generated translation
        outputs = translator(text, max_length=50)

        # Expect a non-empty list whose first item carries the translation
        has_result = outputs and isinstance(outputs, list) and len(outputs) > 0
        if not has_result:
            return ""
        return outputs[0]['translation_text']
    except Exception as e:
        # Best-effort: report the failure and keep going with an empty result
        print(f"Error translating text: {text}. Error: {e}")
        return ""

# Example usage with a DataFrame
# translate_to_french is called once per phrase and the result is stored in a
# new 'phrase_french' column.
df_airline_phrase_selected['phrase_french'] = df_airline_phrase_selected['phrase'].apply(translate_to_french)

# Display the DataFrame
display(df_airline_phrase_selected)

[4] Preview

# Show each English phrase next to its French translation.
# The selected frame holds 'phrase' and 'phrase_french'; it has no
# 'clean' / 'clean_french' columns, so selecting those raises KeyError.
display(df_airline_phrase_selected[['phrase', 'phrase_french']].head())

[5] Save

# Write the selected phrases and their translations to a CSV file
# (with a header row, without the index column)
df_airline_phrase_selected.to_csv("airline-tweets-clean-phrase-french.csv", index=False, header=True)

.