ysharma HF staff committed on
Commit
b032967
·
1 Parent(s): 223ba1d

updates and fixes

Browse files
Files changed (1) hide show
  1. app.py +25 -23
app.py CHANGED
@@ -12,56 +12,58 @@ import cv2
12
 
13
  def resize(img_list):
14
  print("** inside resize **")
15
- print(img_list)
16
  resize_img_list = []
17
  for item in img_list:
18
  im = Image.open(item)
19
  imResize = im.resize((256,256), Image.ANTIALIAS)
20
  resize_img_list.append(np.array(imResize))
21
- print(type(resize_img_list[0]))
22
  return resize_img_list
23
 
24
 
25
  def merge_audio_video(entities_num, resize_img_list, text_input):
26
  print("** inside merge aud vid **")
27
- print(type(resize_img_list))
28
- print(type(resize_img_list[0]))
29
 
30
 
31
  #Convert text to speech using facebook's latest model from HF hub
32
  speech = text2speech(text_input)
33
- print('type of speech : ',type(speech))
34
- print(speech)
 
35
  wav_audio = AudioSegment.from_file(speech, "flac") #("/content/gdrive/My Drive/AI/audio1.flac", "flac")
36
  #convert flac to mp3 audio format
37
- print('flac audio read', type(wav_audio))
38
  wav_audio.export("audio.mp3", format="mp3") #("/content/gdrive/My Drive/AI/audio1.mp3", format="mp3")
39
  print('flac audio converted to mp3 audio' )
40
  print('now getting duration of this mp3 audio' )
41
  #getting audio clip's duration
42
  audio_length = int(MP3("audio.mp3").info.length)
 
43
 
44
  #Calculate the desired frame per second based on given audio length and entities identified
45
  fps= entities_num / audio_length #length of audio file
46
  fps = float(format(fps, '.5f'))
47
- print('fps is: ',fps)
48
 
49
  #String a list of images into a video and write to memory
50
  clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
51
  clip.write_videofile('my_vid_tmp.mp4')
52
- print('video clip created from images')
53
 
54
  # loading video file
55
  print('Starting video and audio merge')
56
  videoclip = VideoFileClip('my_vid_tmp.mp4') #("/content/gdrive/My Drive/AI/my_video1.mp4")
57
- print('loading video-clip audio')
58
 
59
  # loading audio file
60
  audioclip = AudioFileClip('audio.mp3') #.subclip(0, 15)
61
  print('loading mp3-format audio')
62
  # adding audio to the video clip
63
  mergedclip = videoclip.set_audio(audioclip)
64
- print('video and audio merged')
65
 
66
  #Getting size and frame count of merged video file
67
  print('Getting size and frame count of merged video file')
@@ -76,12 +78,12 @@ def merge_audio_video(entities_num, resize_img_list, text_input):
76
  fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")
77
 
78
  def text2speech(text):
79
- print('inside testtospeech')
80
- print(type(fastspeech))
81
  print(fastspeech)
82
  speech = fastspeech(text)
83
- print(type(speech))
84
- print(speech)
85
  return speech
86
 
87
  def engine(text_input):
@@ -101,27 +103,27 @@ def engine(text_input):
101
  print('img_list size:',len(img_list))
102
  #Resizing all images produced to same size
103
  resize_img_list = resize(img_list)
104
- print('back from resize')
105
 
106
 
107
  #Merge video and audio created above
108
  mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
109
- print('Back in engine')
110
- print(' merged clip type :',type(mergedclip))
111
- print('Writing the merged video clip to a file')
112
  mergedclip.to_videofile('mergedvideo.mp4')
113
  print('mergedvideo.mp4 created')
114
 
115
- print('#######################################################################################')
116
  return 'mergedvideo.mp4'
117
 
118
  app = gr.Interface(engine,
119
  gr.inputs.Textbox(lines=5, label="Input Text"),
120
  gr.outputs.Video(type=None, label='Final Merged video'),
121
- description="<div>Firstly, the Demo generates speech from input-text using facebook's fastspeech2-en-ljspeech from <a href='https://huggingface.co/facebook/fastspeech2-en-ljspeech' target='_blank'>HF hub</a>.<br>Then, takes the input-text and extracts the entities in it using Flair NER model from <a href='https://huggingface.co/flair/ner-english-ontonotes-large' target='_blank'>HF Hub</a>. <br>Then, generate images using <a href='https://huggingface.co/spaces/multimodalart/latentdiffusion' target='_blank'>Multimodalart Space</a> for every entity separately.<br>Creates a video by stringing all the entity-images together. <br>Lastly, Fuses the AI generated audio and video together to create a coherent movie for you to watch. <br><br>A fun little app that lets you turn your text to video (well, in some ways atleast :) ). More the entities in your text, More time to build the output, More fun to watch.<br> Please expect build time of around 10-20 seconds per entity. For instance, in the first example there are 13 entities as per the NER model used here.</div>" ,
122
- examples=["On April 17th Sunday George celebrated Easter. He is staying at Empire State building with his parents. He is a citizen of Canada and speaks English and French fluently. His role model is former president Obama. He got 1000 dollar from his mother to visit Disney World and to buy new iPhone mobile. George likes watching Game of Thrones.", "April is the month of Easter weekend. Visit places like Statue of Liberty with friends. Take at least 200 dollars in cash with you. Use Android phone to find places in Newyork City."],
123
  title="Generate Video from Text",
124
- article="<br><div>For best results, make sure to enter a text that has entities listed on model card for <a href='https://huggingface.co/flair/ner-english-ontonotes-large' target='_blank'>flair/ner-english-ontonotes-large</a>. Some examples of type of entities that will be helpful are - Date values, event names, building names, languages, locations, money value, organization names, famous people names, products and so on.</div><br><h4 style='font-size: 110%;margin-top:1em'>Who owns the videos produced by this demo?</h4><div><i>(Borrowing this from multimodalart spaces)</i> Definetly not me! Probably you do. I say probably because the Copyright discussion about AI generated art is ongoing. So <a href='https://www.theverge.com/2022/2/21/22944335/us-copyright-office-reject-ai-generated-art-recent-entrance-to-paradise' target='_blank'>it may be the case that everything produced here falls automatically into the public domain</a>. But in any case it is either yours or is in the public domain.</div>"
125
 
126
  ).launch(enable_queue=True, debug=True)
127
 
 
12
 
13
  def resize(img_list):
14
  print("** inside resize **")
15
+ print('Entity-Images generated by multimodal interface are:',img_list)
16
  resize_img_list = []
17
  for item in img_list:
18
  im = Image.open(item)
19
  imResize = im.resize((256,256), Image.ANTIALIAS)
20
  resize_img_list.append(np.array(imResize))
21
+ print('Type of elements in the image list:',type(resize_img_list[0]))
22
  return resize_img_list
23
 
24
 
25
  def merge_audio_video(entities_num, resize_img_list, text_input):
26
  print("** inside merge aud vid **")
27
+ print('Type of image list variable: ',type(resize_img_list))
28
+ print('Type of elements in the image list: ',type(resize_img_list[0]))
29
 
30
 
31
  #Convert text to speech using facebook's latest model from HF hub
32
  speech = text2speech(text_input)
33
+ print('Back in merge_audio_video')
34
+ print('Type of speech variable : ',type(speech))
35
+ print('Type of Audio file: ',speech)
36
  wav_audio = AudioSegment.from_file(speech, "flac") #("/content/gdrive/My Drive/AI/audio1.flac", "flac")
37
  #convert flac to mp3 audio format
38
+ print('COnverting flac format to mp3 using AudioSegment object:', type(wav_audio))
39
  wav_audio.export("audio.mp3", format="mp3") #("/content/gdrive/My Drive/AI/audio1.mp3", format="mp3")
40
  print('flac audio converted to mp3 audio' )
41
  print('now getting duration of this mp3 audio' )
42
  #getting audio clip's duration
43
  audio_length = int(MP3("audio.mp3").info.length)
44
+ print('Audio length is :',audio_length)
45
 
46
  #Calculate the desired frame per second based on given audio length and entities identified
47
  fps= entities_num / audio_length #length of audio file
48
  fps = float(format(fps, '.5f'))
49
+ print('Based on number of entities/images and audio length, FPS is set as : ',fps)
50
 
51
  #String a list of images into a video and write to memory
52
  clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
53
  clip.write_videofile('my_vid_tmp.mp4')
54
+ print('video clip created successfully from images')
55
 
56
  # loading video file
57
  print('Starting video and audio merge')
58
  videoclip = VideoFileClip('my_vid_tmp.mp4') #("/content/gdrive/My Drive/AI/my_video1.mp4")
59
+ print('loading video-clip')
60
 
61
  # loading audio file
62
  audioclip = AudioFileClip('audio.mp3') #.subclip(0, 15)
63
  print('loading mp3-format audio')
64
  # adding audio to the video clip
65
  mergedclip = videoclip.set_audio(audioclip)
66
+ print('video and audio merged successfully')
67
 
68
  #Getting size and frame count of merged video file
69
  print('Getting size and frame count of merged video file')
 
78
  fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")
79
 
80
  def text2speech(text):
81
+ print('** inside testtospeech **')
82
+ print('Loading the model through :',type(fastspeech))
83
  print(fastspeech)
84
  speech = fastspeech(text)
85
+ print('Type of variable in which file is stored:',type(speech))
86
+ print('Type of Audio file generated :',speech)
87
  return speech
88
 
89
  def engine(text_input):
 
103
  print('img_list size:',len(img_list))
104
  #Resizing all images produced to same size
105
  resize_img_list = resize(img_list)
106
+ print('back from resize into engine')
107
 
108
 
109
  #Merge video and audio created above
110
  mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
111
+ print('\n Back in engine')
112
+ print(' Merged clip type :',type(mergedclip))
113
+ print('Writing the merged video clip to a video file')
114
  mergedclip.to_videofile('mergedvideo.mp4')
115
  print('mergedvideo.mp4 created')
116
 
117
+ print('################################ Single Run Completed ##############################')
118
  return 'mergedvideo.mp4'
119
 
120
  app = gr.Interface(engine,
121
  gr.inputs.Textbox(lines=5, label="Input Text"),
122
  gr.outputs.Video(type=None, label='Final Merged video'),
123
+ description="<div>Firstly, the Demo generates speech from input-text using facebook's fastspeech2-en-ljspeech from <a href='https://huggingface.co/facebook/fastspeech2-en-ljspeech' target='_blank'>HF hub</a>.<br>Then, takes the input-text and extracts the entities in it using Flair NER model from <a href='https://huggingface.co/flair/ner-english-ontonotes-large' target='_blank'>HF Hub</a>. <br>Then, generate images using <a href='https://huggingface.co/spaces/multimodalart/latentdiffusion' target='_blank'>Multimodalart Space</a> for every entity separately.<br>Creates a video by stringing all the entity-images together. <br>Lastly, Fuses the AI generated audio and video together to create a coherent movie for you to watch. <br><br>A fun little app that lets you turn your text to video (well, in some ways atleast :) ). More the entities in your text, More time to build the output, More fun to watch.<br> Please expect build time of around 10-20 seconds per entity. For instance, in the third and largest example there are 13 entities as per the NER model used here.</div>" ,
124
+ examples=["On April 17th Sunday George celebrated Easter. He is staying at Empire State building with his parents.", "George is a citizen of Canada and speaks English and French fluently. His role model is the former president Obama. " , "On April 17th Sunday George celebrated Easter. He is staying at Empire State building with his parents. He is a citizen of Canada and speaks English and French fluently. His role model is former president Obama. He got 1000 dollar from his mother to visit Disney World and to buy new iPhone mobile. George likes watching Game of Thrones.", "April is the month of Easter weekend. Visit places like Statue of Liberty with friends. Take at least 200 dollars in cash with you. Use Android phone to find places in Newyork City."],
125
  title="Generate Video from Text",
126
+ article="<br><div>For best results, make sure to enter a text that has entities listed on model card for <a href='https://huggingface.co/flair/ner-english-ontonotes-large' target='_blank'>flair/ner-english-ontonotes-large</a>. Some examples of type of entities that will be helpful are - Date values, event names, building names, languages, locations, money value, organization names, famous people names, products and so on.<br>Also note that, this Space loads the most awesome Multimodalart space as a gradio interface, hence if the latter space is down former goes down too.</div><br><h4 style='font-size: 110%;margin-top:1em'>Who owns the videos produced by this demo?</h4><div><i>(Borrowing this from multimodalart spaces)</i> Definetly not me! Probably you do. I say probably because the Copyright discussion about AI generated art is ongoing. So <a href='https://www.theverge.com/2022/2/21/22944335/us-copyright-office-reject-ai-generated-art-recent-entrance-to-paradise' target='_blank'>it may be the case that everything produced here falls automatically into the public domain</a>. But in any case it is either yours or is in the public domain.</div>"
127
 
128
  ).launch(enable_queue=True, debug=True)
129