Skip to main content

Command Palette

Search for a command to run...

Transformers Pipeline: Translate English to French

Updated
2 min read
M

Mohamad is interested in programming (mobile, web, database, and machine learning). He is studying at the Center for Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).

[1] Get dataset


import pandas as pd

# CSV of airline tweets hosted on archive.org
dset_url = 'https://archive.org/download/misc-dataset/airline-tweets.csv'

# Fetch the CSV from the URL and parse it into a DataFrame
df_airline = pd.read_csv(filepath_or_buffer=dset_url)

# Print a summary of the DataFrame (columns, dtypes, non-null counts)
df_airline.info()

[2] Clean

import pandas as pd
# !pip install neattext
import neattext.functions as nfx

# Clean the tweet text and store it in a new 'clean' column.
# Order matters: URLs, user handles, and hashtags are stripped first, while
# the digits and punctuation they are made of ("://", ".", "@", "#") are still
# intact for the patterns to match. Contractions are expanded before
# punctuation removal so the apostrophes are still present. Lower-casing
# runs last.
df_airline['clean'] = (
    df_airline['text']
    .apply(nfx.remove_urls)
    .apply(nfx.remove_userhandles)
    .apply(nfx.remove_hashtags)
    .apply(nfx.fix_contractions)
    .apply(nfx.remove_numbers)
    .apply(nfx.remove_punctuations)
    .apply(str.lower)
)

[3] Translate

(Takes approximately 30 minutes with CUDA.)

from transformers import pipeline

# Build the English-to-French translation pipeline on the GPU
translator = pipeline(
    task="translation_en_to_fr",
    model="Helsinki-NLP/opus-mt-en-fr",
    device=0,  # index 0 selects the first GPU
)

# Function to translate text
def translate_to_french(text, max_length=50):
    """Translate one English string to French using the global ``translator``.

    Args:
        text: The English text to translate. Anything that is not a
            non-blank string (None, NaN, numbers, "", whitespace) is treated
            as "nothing to translate".
        max_length: Value forwarded to the pipeline's ``max_length`` argument.
            Defaults to 50, the value previously hard-coded here; raise it if
            translations come back cut short.

    Returns:
        The translated text, or an empty string when the input is blank or
        not a string, the pipeline returns no result, or translation fails.
    """
    # Missing values from pandas (None / NaN) and blank strings are skipped
    # up front instead of being reported as translation errors.
    if not isinstance(text, str) or not text.strip():
        return ""

    try:
        translated = translator(text, max_length=max_length)

        # The result is expected to be a non-empty list of dicts.
        if translated and isinstance(translated, list):
            return translated[0]['translation_text']
        return ""  # Empty or unexpected result shape
    except Exception as e:
        # Deliberately broad: this is applied row by row, and one bad row
        # should be reported and skipped rather than abort the whole run.
        print(f"Error translating text: {text}. Error: {e}")
        return ""

# Translate every cleaned tweet and store the result in a new column
df_airline['clean_french'] = df_airline['clean'].map(translate_to_french)

# Render the full DataFrame in the notebook
display(df_airline)

[4] Preview

# Preview the first rows of the cleaned English text next to its French
# translation. These are the columns created by the cleaning and translation
# steps.
display(df_airline[['clean', 'clean_french']].head())

[5] Save

# Write the DataFrame to a CSV file: keep the header row, omit the index
df_airline.to_csv(
    path_or_buf="airline-tweets-clean-french.csv",
    header=True,
    index=False,
)

.