Files changed (1)
  1. README.md +55 -3
README.md CHANGED
@@ -1,3 +1,55 @@
- ---
- license: mit
- ---
+ # Step 1: Install required libraries
+ !pip install transformers datasets torch sentencepiece
+
+ # Step 2: Import libraries
+ import torch
+ from datasets import load_dataset
+ from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
+
+ # Step 3: Load the dataset
+ dataset = load_dataset("cfilt/iitb-english-hindi")
+
+ # Check the structure of the dataset
+ print(dataset)
+
+ # Step 4: Prepare the tokenizer and model
+ model_name = "Helsinki-NLP/opus-mt-en-hi"
+ tokenizer = MarianTokenizer.from_pretrained(model_name)
+ model = MarianMTModel.from_pretrained(model_name)
+
+ # Step 5: Preprocess the dataset
+ def preprocess_function(examples):
+     # Each example stores the sentence pair under a nested "translation" dict
+     inputs = [pair["en"] for pair in examples["translation"]]
+     targets = [pair["hi"] for pair in examples["translation"]]
+
+     # Tokenize the English source sentences
+     model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=128)
+
+     # Tokenize the Hindi target sentences for training
+     with tokenizer.as_target_tokenizer():
+         labels = tokenizer(targets, truncation=True, padding="max_length", max_length=128)
+
+     model_inputs["labels"] = labels["input_ids"]
+     return model_inputs
+
+ # Apply preprocessing to the dataset
+ tokenized_datasets = dataset.map(preprocess_function, batched=True)
+
+ # Step 6: Train the model
+ training_args = TrainingArguments(
+     output_dir="./results",              # output directory for checkpoints and results
+     evaluation_strategy="epoch",         # evaluate after every epoch
+     learning_rate=2e-5,                  # learning rate
+     per_device_train_batch_size=16,      # batch size for training
+     per_device_eval_batch_size=16,       # batch size for evaluation
+     num_train_epochs=3,                  # number of training epochs
+     logging_dir="./logs",                # directory for storing logs
+     save_steps=500,                      # save a checkpoint every 500 steps
+ )
+
+ # Initialize the Trainer
+ trainer = Trainer(
+     model=model,                                    # the pre-trained model to fine-tune
+     args=training_args,                             # training arguments defined above
+     train_dataset=tokenized_datasets["train"],      # training split
+     eval_dataset=tokenized_datasets["validation"],  # validation split
+ )
+
+ # Train the model
+ trainer.train()
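
After training, the fine-tuned weights can be tried out directly. The following is a minimal inference sketch, not part of the diff above: it assumes the training call has completed and that `tokenizer` and `trainer` are still in memory; the example sentence is purely illustrative.

# Minimal inference sketch (assumes fine-tuning above has finished; the sentence is illustrative)
sample = "How are you today?"

# Reuse the tokenizer and the fine-tuned model held by the trainer
inputs = tokenizer(sample, return_tensors="pt", truncation=True, max_length=128)
inputs = {k: v.to(trainer.model.device) for k, v in inputs.items()}

# Generate the Hindi translation and decode it back to text
translated_ids = trainer.model.generate(**inputs, max_length=128)
print(tokenizer.decode(translated_ids[0], skip_special_tokens=True))

Calling trainer.save_model("./results/final") before inference would persist the fine-tuned checkpoint so it can be reloaded outside this session.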