MT trained on crawled data still sucks
Common Crawl contains a lot of Google Translate output. See if you can guess the source material.
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
def xlate(text, src, trg):
    # Tell the tokenizer which source language code to prepend
    tokenizer.src_lang = src
    encoded = tokenizer(text, return_tensors="pt")
    # Force the decoder to start generating in the target language
    generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(trg))
    out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return out[0]
xlate("Litwo! Ojczyzno moja! ty jesteś jak zdrowie; ile cię trzeba cenić, ten tylko się dowie, Kto cię stracił.", "pl", "ga")
xlate("Hello, how are you?", "en", "ga")