shibly100 committed
Commit b931ae8 · verified · 1 parent: 3698468

Upload 5 files

Files changed (5)
  1. app.py +43 -0
  2. evaluate.py +42 -0
  3. inference.py +32 -0
  4. streamlit +0 -0
  5. train.py +50 -0
app.py ADDED
@@ -0,0 +1,43 @@
+ import streamlit as st
+ import tempfile
+ import whisper
+ from langdetect import detect
+
+ st.set_page_config(page_title="ILR-Based Multilingual Language Assessment App")
+ st.title("ILR-Based Multilingual Language Assessment App")
+ st.write("Upload speech to assess your ILR level with transcription and feedback.")
+
+ # File uploader
+ uploaded_file = st.file_uploader("Upload Audio File (.wav, .mp3, .m4a)", type=["wav", "mp3", "m4a"])
+
+ if uploaded_file is not None:
+     # Save the upload to a temporary file, keeping only its extension as the suffix
+     with tempfile.NamedTemporaryFile(delete=False, suffix="." + uploaded_file.name.split(".")[-1]) as tmp:
+         tmp.write(uploaded_file.read())
+         tmp_path = tmp.name
+
+     # Load the Whisper model (reloaded on every rerun; st.cache_resource would avoid this)
+     model = whisper.load_model("base")
+
+     # Transcribe the audio
+     try:
+         result = model.transcribe(tmp_path)
+         transcription = result["text"]
+
+         # Play back the audio and display the transcription
+         st.audio(uploaded_file, format=uploaded_file.type)  # use the upload's actual MIME type
+         st.subheader("Transcription")
+         st.write(transcription)
+
+         # Detect the language of the transcription
+         language = detect(transcription)
+         st.write(f"**Detected Language**: {language}")
+
+         # Placeholder for ILR scoring logic
+         st.subheader("ILR Level Feedback")
+         st.write("🧠 *Analyzing speech features...*")
+         st.success("Estimated ILR Level: **2+**")
+         st.info("To reach ILR Level 3: Improve connected speech, accuracy, and topic development.")
+
+     except Exception as e:
+         st.error(f"Error processing audio: {e}")
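The app runs locally with `streamlit run app.py` (requires the streamlit, openai-whisper, and langdetect packages, plus ffmpeg on the PATH for Whisper). Note that the ILR feedback above is a hard-coded placeholder. A minimal sketch of what a first-pass heuristic could look like, assuming a hypothetical `estimate_ilr` helper driven by crude fluency proxies — this is not the app's actual scoring logic:

    def estimate_ilr(transcription: str) -> str:
        # Hypothetical heuristic: word count and vocabulary diversity as fluency proxies.
        words = transcription.lower().split()
        if not words:
            return "0+"
        diversity = len(set(words)) / len(words)  # type-token ratio
        if len(words) > 150 and diversity > 0.5:
            return "3"
        if len(words) > 60:
            return "2+"
        return "1+"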
evaluate.py ADDED
@@ -0,0 +1,42 @@
+ import argparse
+ import numpy as np
+ import torch
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ parser = argparse.ArgumentParser(description="Evaluate a fine-tuned DistilBERT model.")
+ parser.add_argument("--task", type=str, required=True,
+                     choices=["classification", "nli"],
+                     help="The evaluation task.")
+ parser.add_argument("--model_dir", type=str, required=True,
+                     help="Path to your saved model directory.")
+ args = parser.parse_args()
+
+ tokenizer = AutoTokenizer.from_pretrained(args.model_dir)
+ model = AutoModelForSequenceClassification.from_pretrained(args.model_dir)
+
+ if args.task == "classification":
+     dataset = load_dataset("glue", "sst2", split="validation").select(range(200))
+     dataset = dataset.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length"), batched=True)
+     labels = dataset["label"]
+ elif args.task == "nli":
+     dataset = load_dataset("snli", split="validation")
+     dataset = dataset.filter(lambda x: x["label"] != -1).select(range(200))
+     dataset = dataset.map(lambda e: tokenizer(e["premise"], e["hypothesis"], truncation=True, padding="max_length"), batched=True)
+     labels = dataset["label"]
+
+ dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
+ loader = torch.utils.data.DataLoader(dataset, batch_size=8)
+
+ all_preds = []
+
+ model.eval()
+ with torch.no_grad():
+     for batch in loader:
+         outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
+         logits = outputs.logits
+         preds = torch.argmax(logits, dim=-1)
+         all_preds.extend(preds.cpu().numpy())
+
+ accuracy = (np.array(all_preds) == np.array(labels)).mean()
+ print(f"Accuracy on {args.task} validation set: {accuracy:.2%}")
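A usage sketch, assuming a model and tokenizer were saved together to a local directory (the path below is illustrative; see the save step suggested after train.py):

    python evaluate.py --task classification --model_dir ./results/final

Note that the nli task expects a 3-label NLI model; running a 2-label SST-2 fine-tune with --task nli would score near chance.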
inference.py ADDED
@@ -0,0 +1,33 @@
+ import argparse
+ import torch
+ from pathlib import Path
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ def classify(text, model, tokenizer):
+     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     logits = outputs.logits
+     prediction = torch.argmax(logits, dim=1).item()
+     label = model.config.id2label.get(prediction, str(prediction))
+     return label
+
+ parser = argparse.ArgumentParser(description="Run inference with your fine-tuned DistilBERT model.")
+ parser.add_argument("--task", type=str, choices=["classification"], required=True, help="Task to run inference on.")
+ parser.add_argument("--model_dir", type=str, required=True, help="Relative or absolute path to model directory.")
+ parser.add_argument("--text", type=str, help="Input text to classify.")
+
+ args = parser.parse_args()
+
+ # Ensure the model directory is interpreted as a local folder
+ model_path = Path(args.model_dir).resolve()
+
+ tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
+ model = AutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)
+ model.eval()  # disable dropout for deterministic predictions
+
+ if args.task == "classification":
+     if not args.text:
+         raise ValueError("Please provide --text for classification.")
+     result = classify(args.text, model, tokenizer)
+     print(f"\nInput: {args.text}\nPrediction: {result}")
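Example invocation against the same illustrative model directory:

    python inference.py --task classification --model_dir ./results/final --text "A moving, beautifully acted film."

A freshly fine-tuned two-label model will print the generic LABEL_0/LABEL_1 names unless id2label was customized in the model config.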
streamlit ADDED
(empty file)
train.py ADDED
@@ -0,0 +1,50 @@
+ from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
+ from datasets import load_dataset
+ import numpy as np
+ import evaluate  # caution: the sibling evaluate.py shadows this library when run from the repo root
+
+ # Load dataset
+ dataset = load_dataset("imdb")
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+
+ # Tokenization function
+ def tokenize_function(example):
+     return tokenizer(example["text"], padding="max_length", truncation=True)
+
+ # Tokenize dataset
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+ # Load model
+ model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
+
+ # Load accuracy metric
+ accuracy = evaluate.load("accuracy")
+
+ # Compute metrics function
+ def compute_metrics(eval_pred):
+     logits, labels = eval_pred
+     predictions = np.argmax(logits, axis=-1)
+     return accuracy.compute(predictions=predictions, references=labels)
+
+ # Define training arguments
+ training_args = TrainingArguments(
+     output_dir="./results",
+     evaluation_strategy="epoch",
+     learning_rate=2e-5,
+     per_device_train_batch_size=8,
+     per_device_eval_batch_size=8,
+     num_train_epochs=1,
+     weight_decay=0.01,
+ )
+
+ # Initialize Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(2000)),
+     eval_dataset=tokenized_datasets["test"].shuffle(seed=42).select(range(1000)),
+     compute_metrics=compute_metrics,
+ )
+
+ # Train model
+ trainer.train()
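As written, train.py leaves only the Trainer's periodic checkpoints under ./results, and since no tokenizer is passed to the Trainer those checkpoints omit the tokenizer files that evaluate.py and inference.py expect to find in --model_dir. A minimal sketch of the missing save step, assuming ./results/final as the target path:

    # Save the fine-tuned model and tokenizer side by side so that
    # evaluate.py and inference.py can load them from a single directory.
    trainer.save_model("./results/final")
    tokenizer.save_pretrained("./results/final")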