ishworrsubedii commited on
Commit
b368e21
1 Parent(s): 2c6b8d9

Integrated speech transcription

Browse files
app.py CHANGED
@@ -6,10 +6,10 @@ from fastapi import FastAPI, File, UploadFile
6
  from pydantic import BaseModel
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from langchain_community.document_loaders import UnstructuredURLLoader
 
9
 
 
10
 
11
-
12
- app = FastAPI(title = "ConversAI", root_path = "/api/v1")
13
  app.add_middleware(
14
  CORSMiddleware,
15
  allow_origins=["*"],
@@ -18,29 +18,33 @@ app.add_middleware(
18
  allow_headers=["*"],
19
  )
20
 
 
 
 
21
  @app.post("/signup")
22
  async def signup(username: str, password: str):
23
- response = createUser(username = username, password = password)
24
  return response
25
 
26
 
27
  @app.post("/login")
28
  async def login(username: str, password: str):
29
- response = matchPassword(username = username, password = password)
30
  return response
31
 
32
 
33
  @app.post("/newChatbot")
34
  async def newChatbot(chatbotName: str, username: str):
35
- currentBotCount = len(listTables(username = username)["output"])
36
- limit = client.table("ConversAI_UserConfig").select("chatbotLimit").eq("username", username).execute().data[0]["chatbotLimit"]
 
37
  if currentBotCount >= int(limit):
38
  return {
39
  "output": "CHATBOT LIMIT EXCEEDED"
40
  }
41
  client.table("ConversAI_ChatbotInfo").insert({"username": username, "chatbotname": chatbotName}).execute()
42
  chatbotName = f"convai-{username}-{chatbotName}"
43
- return createTable(tablename = chatbotName)
44
 
45
 
46
  @app.post("/addPDF")
@@ -53,11 +57,13 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
53
  username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
54
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
55
  currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
56
- limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0]["tokenLimit"]
 
57
  newCount = currentCount + len(text)
58
  if newCount < int(limit):
59
- client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq("chatbotname", chatbotname).execute()
60
- return addDocuments(text = text, vectorstore = vectorstore)
 
61
  else:
62
  return {
63
  "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
@@ -67,7 +73,7 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
67
  @app.post("/scanAndReturnText")
68
  async def returnText(pdf: UploadFile = File(...)):
69
  pdf = await pdf.read()
70
- text = getTextFromImagePDF(pdfBytes = pdf)
71
  return text
72
 
73
 
@@ -77,10 +83,12 @@ async def addText(vectorstore: str, text: str):
77
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
78
  currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
79
  newCount = currentCount + len(text)
80
- limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0]["tokenLimit"]
 
81
  if newCount < int(limit):
82
- client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq("chatbotname", chatbotname).execute()
83
- return addDocuments(text = text, vectorstore = vectorstore)
 
84
  else:
85
  return {
86
  "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
@@ -100,10 +108,12 @@ async def addText(addQaPair: AddQAPair):
100
  currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
101
  qa = f"QUESTION: {addQaPair.question}\tANSWER: {addQaPair.answer}"
102
  newCount = currentCount + len(qa)
103
- limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0]["tokenLimit"]
 
104
  if newCount < int(limit):
105
- client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq("chatbotname", chatbotname).execute()
106
- return addDocuments(text = qa, vectorstore = addQaPair.vectorstore)
 
107
  else:
108
  return {
109
  "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
@@ -115,20 +125,24 @@ async def addWebsite(vectorstore: str, websiteUrls: list[str]):
115
  urls = websiteUrls
116
  loader = UnstructuredURLLoader(urls=urls)
117
  docs = loader.load()
118
- text = "\n\n".join([f"Metadata:\n{docs[doc].metadata} \nPage Content:\n {docs[doc].page_content}" for doc in range(len(docs))])
 
119
  username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
120
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
121
  currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
122
  newCount = currentCount + len(text)
123
- limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0]["tokenLimit"]
 
124
  if newCount < int(limit):
125
- client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq("chatbotname", chatbotname).execute()
126
- return addDocuments(text = text, vectorstore = vectorstore)
 
127
  else:
128
  return {
129
  "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
130
  }
131
 
 
132
  @app.post("/answerQuery")
133
  async def answerQuestion(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192"):
134
  return answerQuery(query=query, vectorstore=vectorstore, llmModel=llmModel)
@@ -140,15 +154,18 @@ async def delete(chatbotName: str):
140
  client.table('ConversAI_ChatbotInfo').delete().eq('username', username).eq('chatbotname', chatbotName).execute()
141
  return deleteTable(tableName=chatbotName)
142
 
 
143
  @app.post("/listChatbots")
144
  async def delete(username: str):
145
  return listTables(username=username)
146
 
 
147
  @app.post("/getLinks")
148
  async def crawlUrl(baseUrl: str):
149
  return {
150
  "urls": getLinks(url=baseUrl, timeout=30)
151
- }
 
152
 
153
  @app.post("/getCurrentCount")
154
  async def getCount(vectorstore: str):
@@ -156,11 +173,12 @@ async def getCount(vectorstore: str):
156
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
157
  return {
158
  "currentCount": df[(df['username'] == username) & (df['chatbotname'] == chatbotName)]['charactercount'].iloc[0]
159
- }
 
160
 
161
  @app.post("/getYoutubeTranscript")
162
  async def getYTTranscript(urls: str):
163
- return getTranscript(urls = urls)
164
 
165
 
166
  @app.post("/analyzeData")
@@ -169,10 +187,10 @@ async def analyzeAndAnswer(query: str, file: UploadFile = File(...)):
169
  try:
170
  if extension in ["xls", "xlsx", "xlsm", "xlsb"]:
171
  df = pd.read_excel(io.BytesIO(await file.read()))
172
- response = analyzeData(query = query, dataframe = df)
173
  elif extension == "csv":
174
  df = pd.read_csv(io.BytesIO(await file.read()))
175
- response = analyzeData(query = query, dataframe = df)
176
  else:
177
  response = "INVALID FILE TYPE"
178
  return {
@@ -181,4 +199,4 @@ async def analyzeAndAnswer(query: str, file: UploadFile = File(...)):
181
  except:
182
  return {
183
  "output": "UNABLE TO ANSWER QUERY"
184
- }
 
6
  from pydantic import BaseModel
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from langchain_community.document_loaders import UnstructuredURLLoader
9
+ from src.api.speech_api import speech_translator_router
10
 
11
+ app = FastAPI(title="ConversAI", root_path="/api/v1")
12
 
 
 
13
  app.add_middleware(
14
  CORSMiddleware,
15
  allow_origins=["*"],
 
18
  allow_headers=["*"],
19
  )
20
 
21
+ app.include_router(speech_translator_router, prefix="/speech")
22
+
23
+
24
@app.post("/signup")
async def signup(username: str, password: str):
    # Delegate account creation to the user-store helper; response shape is
    # whatever createUser returns (passed through unchanged).
    response = createUser(username=username, password=password)
    return response
28
 
29
 
30
@app.post("/login")
async def login(username: str, password: str):
    # Verify credentials via the password-matching helper and pass its
    # result straight back to the client.
    response = matchPassword(username=username, password=password)
    return response
34
 
35
 
36
@app.post("/newChatbot")
async def newChatbot(chatbotName: str, username: str):
    # Number of chatbots the user already owns (tables listed for this user).
    currentBotCount = len(listTables(username=username)["output"])
    # Per-user chatbot quota from the ConversAI_UserConfig table.
    limit = client.table("ConversAI_UserConfig").select("chatbotLimit").eq("username", username).execute().data[0][
        "chatbotLimit"]
    if currentBotCount >= int(limit):
        return {
            "output": "CHATBOT LIMIT EXCEEDED"
        }
    # Register the chatbot, then create its vector table under a namespaced
    # "convai-<username>-<chatbotName>" identifier.
    client.table("ConversAI_ChatbotInfo").insert({"username": username, "chatbotname": chatbotName}).execute()
    chatbotName = f"convai-{username}-{chatbotName}"
    return createTable(tablename=chatbotName)
48
 
49
 
50
  @app.post("/addPDF")
 
57
  username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
58
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
59
  currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
60
+ limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
61
+ "tokenLimit"]
62
  newCount = currentCount + len(text)
63
  if newCount < int(limit):
64
+ client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
65
+ "chatbotname", chatbotname).execute()
66
+ return addDocuments(text=text, vectorstore=vectorstore)
67
  else:
68
  return {
69
  "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
 
73
@app.post("/scanAndReturnText")
async def returnText(pdf: UploadFile = File(...)):
    # Read the uploaded PDF into memory, then extract text from its page
    # images via the OCR helper.
    pdf = await pdf.read()
    text = getTextFromImagePDF(pdfBytes=pdf)
    return text
78
 
79
 
 
83
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
84
  currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
85
  newCount = currentCount + len(text)
86
+ limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
87
+ "tokenLimit"]
88
  if newCount < int(limit):
89
+ client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
90
+ "chatbotname", chatbotname).execute()
91
+ return addDocuments(text=text, vectorstore=vectorstore)
92
  else:
93
  return {
94
  "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
 
108
  currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
109
  qa = f"QUESTION: {addQaPair.question}\tANSWER: {addQaPair.answer}"
110
  newCount = currentCount + len(qa)
111
+ limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
112
+ "tokenLimit"]
113
  if newCount < int(limit):
114
+ client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
115
+ "chatbotname", chatbotname).execute()
116
+ return addDocuments(text=qa, vectorstore=addQaPair.vectorstore)
117
  else:
118
  return {
119
  "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
 
125
  urls = websiteUrls
126
  loader = UnstructuredURLLoader(urls=urls)
127
  docs = loader.load()
128
+ text = "\n\n".join(
129
+ [f"Metadata:\n{docs[doc].metadata} \nPage Content:\n {docs[doc].page_content}" for doc in range(len(docs))])
130
  username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
131
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
132
  currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
133
  newCount = currentCount + len(text)
134
+ limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
135
+ "tokenLimit"]
136
  if newCount < int(limit):
137
+ client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
138
+ "chatbotname", chatbotname).execute()
139
+ return addDocuments(text=text, vectorstore=vectorstore)
140
  else:
141
  return {
142
  "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
143
  }
144
 
145
+
146
@app.post("/answerQuery")
async def answerQuestion(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192"):
    # Delegate retrieval-augmented answering to the answerQuery helper,
    # defaulting to the llama3-70b model.
    return answerQuery(query=query, vectorstore=vectorstore, llmModel=llmModel)
 
154
  client.table('ConversAI_ChatbotInfo').delete().eq('username', username).eq('chatbotname', chatbotName).execute()
155
  return deleteTable(tableName=chatbotName)
156
 
157
+
158
@app.post("/listChatbots")
# NOTE(review): this function is named `delete`, duplicating the
# /deleteChatbot handler's name. FastAPI dispatches by decorator so routing
# is unaffected, but renaming it (e.g. listChatbots) would avoid the
# module-level shadowing — confirm before changing.
async def delete(username: str):
    return listTables(username=username)
161
 
162
+
163
@app.post("/getLinks")
async def crawlUrl(baseUrl: str):
    # Crawl the base URL (30-second timeout) and return every link found.
    return {
        "urls": getLinks(url=baseUrl, timeout=30)
    }
168
+
169
 
170
  @app.post("/getCurrentCount")
171
  async def getCount(vectorstore: str):
 
173
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
174
  return {
175
  "currentCount": df[(df['username'] == username) & (df['chatbotname'] == chatbotName)]['charactercount'].iloc[0]
176
+ }
177
+
178
 
179
@app.post("/getYoutubeTranscript")
async def getYTTranscript(urls: str):
    # Fetch the YouTube transcript(s) for the given URL string.
    return getTranscript(urls=urls)
182
 
183
 
184
  @app.post("/analyzeData")
 
187
  try:
188
  if extension in ["xls", "xlsx", "xlsm", "xlsb"]:
189
  df = pd.read_excel(io.BytesIO(await file.read()))
190
+ response = analyzeData(query=query, dataframe=df)
191
  elif extension == "csv":
192
  df = pd.read_csv(io.BytesIO(await file.read()))
193
+ response = analyzeData(query=query, dataframe=df)
194
  else:
195
  response = "INVALID FILE TYPE"
196
  return {
 
199
  except:
200
  return {
201
  "output": "UNABLE TO ANSWER QUERY"
202
+ }
requirements.txt CHANGED
@@ -24,4 +24,5 @@ pdf2image
24
  sentence-transformers
25
  supabase
26
  unstructured
27
- urllib3
 
 
24
  sentence-transformers
25
  supabase
26
  unstructured
27
+ urllib3
28
+ gtts
src/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Created By: ishwor subedi
Date: 2024-07-31

Package init: installs the process-wide logging configuration from
``logging_config.yaml`` when ``src`` is first imported.
"""

import logging.config
import os

import yaml

# Ensure the directory for log files exists (the YAML config's file handlers
# presumably write into ./logs — confirm against logging_config.yaml).
# makedirs(exist_ok=True) replaces the racy exists()/makedirs() pair.
os.makedirs("logs", exist_ok=True)

# Load the dictConfig-format logging configuration from the current working
# directory and install it for the whole process.
log_config_path = os.path.join(os.getcwd(), "logging_config.yaml")
with open(log_config_path, 'r') as file:
    config = yaml.safe_load(file.read())

logging.config.dictConfig(config)
src/api/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ Created By: ishwor subedi
3
+ Date: 2024-07-31
4
+ """
src/api/speech_api.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Created By: ishwor subedi
3
+ Date: 2024-07-31
4
+ """
5
+ import os
6
+ import tempfile
7
+ from fastapi.responses import JSONResponse
8
+ from fastapi import Form
9
+ from fastapi import UploadFile, HTTPException, status
10
+ from src.models.models import TextToSpeechRequest
11
+ from fastapi.routing import APIRouter
12
+ from src.pipeline.speech_transcription_pipeline import SpeechTranscriptionPipeline
13
+ from src import logging
14
+
15
+ speech_translator_router = APIRouter(tags=["SpeechTranscription"])
16
+ pipeline = SpeechTranscriptionPipeline()
17
+
18
+
19
+ @speech_translator_router.post(
20
+ "/text_to_speech",
21
+ description="""
22
+ ** For language refer below points**
23
+ **Supported Locales:**
24
+
25
+ - **English:**
26
+ - **Australia:**
27
+ - **Language:** en
28
+ - **TLD:** com.au
29
+ - **United Kingdom:**
30
+ - **Language:** en
31
+ - **TLD:** co.uk
32
+ - **United States:**
33
+ - **Language:** en
34
+ - **TLD:** us
35
+ - **Canada:**
36
+ - **Language:** en
37
+ - **TLD:** ca
38
+ - **India:**
39
+ - **Language:** en
40
+ - **TLD:** co.in
41
+ - **Ireland:**
42
+ - **Language:** en
43
+ - **TLD:** ie
44
+ - **South Africa:**
45
+ - **Language:** en
46
+ - **TLD:** co.za
47
+ - **Nigeria:**
48
+ - **Language:** en
49
+ - **TLD:** com.ng
50
+
51
+ - **French:**
52
+ - **Canada:**
53
+ - **Language:** fr
54
+ - **TLD:** ca
55
+ - **France:**
56
+ - **Language:** fr
57
+ - **TLD:** fr
58
+
59
+ - **Mandarin:**
60
+ - **China Mainland:**
61
+ - **Language:** zh-CN
62
+ - **TLD:** any
63
+ - **Taiwan:**
64
+ - **Language:** zh-TW
65
+ - **TLD:** any
66
+
67
+ - **Portuguese:**
68
+ - **Brazil:**
69
+ - **Language:** pt
70
+ - **TLD:** com.br
71
+ - **Portugal:**
72
+ - **Language:** pt
73
+ - **TLD:** pt
74
+
75
+ - **Spanish:**
76
+ - **Mexico:**
77
+ - **Language:** es
78
+ - **TLD:** com.mx
79
+ - **Spain:**
80
+ - **Language:** es
81
+ - **TLD:** es
82
+ - **United States:**
83
+ - **Language:** es
84
+ - **TLD:** us
85
+ """
86
+ )
87
async def text_to_speech(request: TextToSpeechRequest):
    """
    Convert text to speech via the gTTS-backed pipeline.

    :param request: TextToSpeechRequest carrying text, lang, and tld.
    :return: JSONResponse with the Base64-encoded MP3 under "audio".
    :raises HTTPException: 400 when synthesis produced no audio,
        500 for any other failure.
    """
    # Lazy %-style logging args replace the original placeholder-free
    # f-strings (same rendered messages, no eager formatting).
    logging.info("Text to speech request received")
    try:
        audio_bytes = pipeline.text_to_speech(request.text, request.lang, request.tld)
        if not audio_bytes:
            logging.error("Audio generation failed.")
            raise ValueError("Audio generation failed.")
        logging.info("Text to speech request processed successfully")
        return JSONResponse(content={"audio": audio_bytes, "status_code": status.HTTP_200_OK}, status_code=200)
    except ValueError as ve:
        logging.error("Error processing text to speech request: %s", str(ve))
        raise HTTPException(status_code=400, detail=str(ve)) from ve
    except Exception as e:
        logging.error("Internal Server Error: %s", str(e))
        raise HTTPException(status_code=500, detail="Internal Server Error") from e
102
+
103
+
104
+ @speech_translator_router.post(
105
+ "/speech_to_text",
106
+ description="""
107
+ ** Specify the language used in the audio **
108
+ **Supported Languages:**
109
+
110
+ **Major Languages:**
111
+ - **English:** en
112
+ - **Mandarin Chinese:** zh
113
+ - **Spanish:** es
114
+ - **French:** fr
115
+ - **German:** de
116
+ - **Italian:** it
117
+ - **Japanese:** ja
118
+ - **Korean:** ko
119
+ - **Russian:** ru
120
+ - **Portuguese:** pt
121
+ - **Arabic:** ar
122
+
123
+ **Additional Languages:**
124
+
125
+ - **Indic Languages:**
126
+ - **Hindi:** hi
127
+ - **Bengali:** bn
128
+ - **Tamil:** ta
129
+ - **Telugu:** te
130
+
131
+ - **Southeast Asian Languages:**
132
+ - **Vietnamese:** vi
133
+ - **Thai:** th
134
+ - **Indonesian:** id
135
+ - **Malay:** ms
136
+
137
+ - **African Languages:**
138
+ - **Swahili:** sw
139
+ - **Yoruba:** yo
140
+ - **Hausa:** ha
141
+
142
+ - **European Languages:**
143
+ - **Polish:** pl
144
+ - **Dutch:** nl
145
+ - **Swedish:** sv
146
+ - **Norwegian:** no
147
+ """
148
+ )
149
async def speech_to_text(audio: UploadFile, lang: str = Form(...)):
    """
    Transcribe an uploaded audio file to text.

    :param audio: Uploaded audio file, read fully into memory.
    :param lang: Language code of the speech in the audio.
    :return: JSONResponse with the transcript under "transcript".
    :raises HTTPException: 400 for empty/invalid uploads, 404 if the
        temporary file vanished, 500 for any other processing failure.
    """
    logging.info("Speech to text request received")
    try:
        audio_bytes = await audio.read()
        if not audio_bytes:
            logging.error("Empty audio file")
            raise ValueError("Empty audio file")
    except Exception as e:
        logging.error("Invalid audio file %s", e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid audio file"
        ) from e

    try:
        # The transcription pipeline expects a file path, so persist the
        # upload to a temporary .wav file (delete=False: removed in finally).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
            temp_audio_file.write(audio_bytes)
            temp_audio_file_path = temp_audio_file.name
            logging.info("Temporary audio file created at %s", temp_audio_file_path)
    except Exception as e:
        logging.error("Could not process audio file%s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Could not process audio file"
        ) from e

    try:
        logging.info("Transcribing audio to text")
        transcript = pipeline.speech_to_text(temp_audio_file_path, lang)
    except FileNotFoundError as fnfe:
        # BUG FIX: the original logged `fnfel` (undefined name), which raised
        # a NameError inside this handler instead of reporting the real error.
        logging.error("Temporary file not found%s", fnfe)
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Temporary file not found"
        ) from fnfe
    except Exception as e:
        logging.error("Error processing speech-to-text: %s", str(e))
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Error processing speech-to-text"
        ) from e
    finally:
        # Always remove the temp file, whether transcription succeeded or not.
        logging.info("Cleaning up temporary audio file")
        if os.path.exists(temp_audio_file_path):
            os.remove(temp_audio_file_path)

    return JSONResponse(content={"transcript": transcript, "status_code": status.HTTP_200_OK}, status_code=200)
src/components/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ Created By: ishwor subedi
3
+ Date: 2024-07-31
4
+ """
src/components/speech_to_text.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Created By: ishwor subedi
3
+ Date: 2024-07-31
4
+ """
5
+ import torch
6
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
7
+
8
+
9
class SpeechToText:
    """Whisper-based speech recognition (openai/whisper-large-v3)."""

    def __init__(self):
        # Prefer GPU with fp16 when available; otherwise CPU with fp32.
        use_cuda = torch.cuda.is_available()
        self.device = "cuda:0" if use_cuda else "cpu"
        self.torch_dtype = torch.float16 if use_cuda else torch.float32

        model_id = "openai/whisper-large-v3"

        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.speech_to_text_pipeline = self.pipeline()

    def pipeline(self):
        """Build and return the HF automatic-speech-recognition pipeline."""
        return pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            max_new_tokens=128,   # max tokens generated per chunk
            chunk_length_s=30,    # seconds of audio processed per chunk
            batch_size=16,        # chunks processed per batch
            return_timestamps=True,
            torch_dtype=self.torch_dtype,
            device=self.device,
        )

    def transcribe_audio(self, audio, language: str = "en"):
        """
        Transcribe an audio input to text.

        :param audio: audio input accepted by the HF ASR pipeline (e.g. a file path)
        :param language: language code of the audio
        :return: tuple of (timestamped chunks, full transcript text)
        """
        # NOTE(review): task="translate" makes Whisper emit English output
        # regardless of source language — confirm this is intended rather
        # than "transcribe".
        result = self.speech_to_text_pipeline(
            audio,
            return_timestamps=True,
            generate_kwargs={"language": language, "task": "translate"},
        )
        return result["chunks"], result["text"]
src/components/text_to_speech_gtts.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Created By: ishwor subedi
3
+ Date: 2024-07-31
4
+ """
5
+ import base64
6
+ from io import BytesIO
7
+
8
+ from gtts import gTTS
9
+ from gtts.tokenizer import pre_processors
10
+
11
+
12
class TextToSpeech:
    """Text-to-speech via gTTS, producing Base64-encoded MP3 audio."""

    def __init__(self):
        # gTTS text pre-processors applied before synthesis.
        self.preprocessing = [
            pre_processors.tone_marks,
            pre_processors.end_of_line,
            pre_processors.word_sub,
            pre_processors.abbreviations,
        ]

    def conversion(self, text: str, lang: str, tld: str) -> str:
        """
        Convert text to speech and return the Base64-encoded MP3 data.

        :param text: The text to convert to speech.
        :param lang: The language in which to convert the text.
        :param tld: Google Translate top-level domain selecting the accent.
        :return: Base64-encoded MP3 data as a string.
        """
        tts = gTTS(text=text, lang=lang, slow=False, tld=tld,
                   pre_processor_funcs=self.preprocessing)
        buffer = BytesIO()
        tts.write_to_fp(buffer)
        # getvalue() returns the whole buffer regardless of position.
        return base64.b64encode(buffer.getvalue()).decode("utf-8")
src/models/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ Created By: ishwor subedi
3
+ Date: 2024-07-31
4
+ """
src/models/models.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Created By: ishwor subedi
3
+ Date: 2024-07-31
4
+ """
5
+ from fastapi import UploadFile
6
+ from pydantic import BaseModel
7
+
8
+
9
class TextToSpeechRequest(BaseModel):
    """Request payload for the /text_to_speech endpoint."""
    # Text to synthesize.
    text: str
    # Language code (e.g. "en", "fr").
    lang: str
    # Google Translate top-level domain selecting the accent (e.g. "co.uk").
    tld: str


class SpeechToTextRequest(BaseModel):
    """Request payload describing the language of an uploaded audio file."""
    # Language code of the speech in the audio.
    lang: str
src/pipeline/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ Created By: ishwor subedi
3
+ Date: 2024-07-31
4
+ """
src/pipeline/speech_transcription_pipeline.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Created By: ishwor subedi
3
+ Date: 2024-07-31
4
+ """
5
+ from src.components.speech_to_text import SpeechToText
6
+ from src.components.text_to_speech_gtts import TextToSpeech
7
+
8
+
9
class SpeechTranscriptionPipeline:
    """Facade combining the speech-to-text and text-to-speech components."""

    def __init__(self):
        self.speech_to_text_ = SpeechToText()
        self.text_to_speech_ = TextToSpeech()

    def text_to_speech(self, text: str, lang: str, tld: str) -> str:
        """
        Convert text to speech.

        :param text: The text to convert to speech.
        :param lang: The language in which to convert the text.
        :param tld: Accent-selecting top-level domain, passed through to gTTS.
        :return: Base64-encoded MP3 audio of the text.
        """
        return self.text_to_speech_.conversion(text, lang, tld)

    def speech_to_text(self, audio, lang: str) -> str:
        """
        Convert speech to text.

        :param audio: The audio data to convert to text.
        :param lang: The language in which the audio is spoken.
        :return: The plain transcript (timestamped chunks are discarded).
        """
        _, transcript = self.speech_to_text_.transcribe_audio(audio=audio, language=lang)
        return transcript
src/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ Created By: ishwor subedi
3
+ Date: 2024-07-31
4
+ """
src/utils/utils.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Created By: ishwor subedi
3
+ Date: 2024-08-02
4
+ """
5
+ import yaml
6
+
7
+
8
def load_config(file_path):
    """Load a YAML configuration file and return its parsed contents."""
    with open(file_path, 'r') as cfg_file:
        return yaml.safe_load(cfg_file)