# run_speech2roboExp_gui.py
import os
from datetime import datetime
import time
import configparser
import gradio as gr
import torch.cuda
import trimesh
import numpy as np
import shutil
import pickle
from typing import List
from speech2roboExp.model.audio_model import initialize_wav2vec2_model
from speech2roboExp.model.model import create_and_load_model
from speech2roboExp.inference import inference, decode_vert_offset
from speech2roboExp.utils.utils import convert_offset_seq_to_vert_seq
from render_utils.render import render_video
from utils.utils import create_dummy_sequence, add_eye_blink_arkit
from utils.livelink_utils import write_to_livelink_face


def run_inference(audio_filepath: str,
                  subj: str = 'FaceTalk_170809_00138_TA',
                  eye_blink: str = 'Yes',
                  result_path: str = 'results/speech2roboExp/gui',
                  subj_mesh_path: str = 'speech2roboExp/assets/human_voca',
                  progress: gr.Progress = gr.Progress(track_tqdm=True)) -> (str, str, str,
                                                                            List[List[float]],
                                                                            List[List[float]]):
    """
    Run inference: take an input audio file and produce an animation video plus a LiveLink-format blendshape file
    Args:
        audio_filepath: path to input audio file
        subj: subject name
        eye_blink: whether to add eye blink ('Yes' or 'No')
        result_path: path to result directory
        subj_mesh_path: path to subject mesh directory
        progress: progress bar for gradio, tracks internal tqdm progress
    Returns:
        result_video_filepath: path to result video file
        result_livelink_filepath: path to result file in LiveLink Face format
        copied_audio_filepath: path to copied audio file
        time_cost: time cost, [[generation_time, render_time]], seconds
        speed: speed, [[generation_speed, render_speed]], frames per second
    """
    dst_filename = (f"{os.path.splitext(os.path.basename(audio_filepath))[0]}_{subj}_"
                    f"{datetime.now().strftime('%Y%m%d-%H%M%S')}")
    os.makedirs(result_path, exist_ok=True)  # make sure the result directory exists
    start_time = time.time()

    # one-hot encodings
    subj_encoding = subj_one_hot_labels[:, subj_name2id_dict[subj]]

    # inference
    blendshape_seq, _ = inference(audio_filepath, subj_encoding,
                                  audio_processor, audio_model, art_model,
                                  SPEECH_WINDOW_SIZE, neutral_vert, device,
                                  flag_output_blendshape_only=True)

    # optionally add eye blink
    flag_add_eye_blink = eye_blink == 'Yes'
    if flag_add_eye_blink:
        blendshape_seq = add_eye_blink_arkit(blendshape_seq)
    end_time = time.time()
    generation_time = end_time - start_time
    print('Generation time: {:.2f} seconds'.format(generation_time))

    progress(0.9, desc='Decoding vertex sequence for rendering')
    # compute vertex offset
    vert_offset_seq = decode_vert_offset(blendshape_seq, art_model, device)

    # convert vertex offset to vertex
    subj_mesh = trimesh.load(os.path.join(subj_mesh_path, subj + '.obj'), process=False)
    subj_neutral_vert = subj_mesh.vertices.T
    vert_seq = convert_offset_seq_to_vert_seq(vert_offset_seq, subj_neutral_vert)

    # render animation
    result_video_filepath = os.path.join(result_path, dst_filename + '.mp4')
    render_video(result_video_filepath, audio_filepath, vert_seq, faces, fps=FPS)
    end_time = time.time()
    render_time = end_time - start_time - generation_time
    print('Render time: {:.2f} seconds'.format(render_time))

    # save blendshape sequence in LiveLink Face format, TODO: replace dummy values
    result_livelink_filepath = os.path.join(result_path, dst_filename + '.txt')
    head_pos_seq = create_dummy_sequence(blendshape_seq.shape[0], 3)  # dummy head position
    head_rot_seq = create_dummy_sequence(blendshape_seq.shape[0], 3)  # dummy head rotation
    left_eye_rot_seq = create_dummy_sequence(blendshape_seq.shape[0], 2)  # dummy left eye rotation
    right_eye_rot_seq = create_dummy_sequence(blendshape_seq.shape[0], 2)  # dummy right eye rotation
    write_to_livelink_face(result_livelink_filepath, blendshape_seq, head_pos_seq,
                           head_rot_seq, left_eye_rot_seq, right_eye_rot_seq, fps=FPS)

    # copy audio file
    copied_audio_filepath = os.path.join(result_path, dst_filename + '.wav')
    shutil.copy(audio_filepath, copied_audio_filepath)

    return (result_video_filepath, result_livelink_filepath, copied_audio_filepath,
            [[generation_time, render_time]],
            [[int(blendshape_seq.shape[0] / generation_time), int(blendshape_seq.shape[0] / render_time)]])
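
# Illustrative usage (hypothetical, kept as comments): once the module-level setup below has run,
# run_inference can also be called directly, bypassing the Gradio UI. The audio path is taken from
# the bundled examples; note that the default gr.Progress object may behave differently outside a
# Gradio event context.
#
#   video_fp, livelink_fp, audio_fp, time_cost, speed = run_inference(
#       'speech2roboExp/test_samples/jobs_speech_1.wav',
#       subj='FaceTalk_170809_00138_TA',
#       eye_blink='Yes')
#   print('Video written to:', video_fp)
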
# set global parameters
cfg_filepath = 'speech2roboExp/config/infer.cfg'

# load config
if not os.path.exists(cfg_filepath):
    raise FileNotFoundError('Config file not found: %s' % cfg_filepath)
cfg = configparser.RawConfigParser()
cfg.read(cfg_filepath)

# initialize
FPS = int(cfg['infer']['FPS'])  # frame rate
SPEECH_WINDOW_SIZE = int(cfg['model']['SPEECH_WINDOW_SIZE'])  # sliding window size for speech features
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# load template face mesh
template_mesh = trimesh.load(cfg['data']['TEMPLATE_MESH_FILEPATH'], process=False)
faces = template_mesh.faces.astype(np.int32)
neutral_vert = template_mesh.vertices.T

# initialize wav2vec2 model
print('Initializing wav2vec2 model...')
audio_processor, audio_model = initialize_wav2vec2_model()
audio_model.to(device).eval()

# initialize articulation model
print('Initializing articulation conv-net model...')
art_model, _ = create_and_load_model(cfg)
art_model.to(device).eval()

# initialize index mappings
with open(cfg['data']['SUBJ_NAME2ID_FILEPATH'], 'rb') as f:
    subj_name2id_dict = pickle.load(f)
subj_one_hot_labels = np.eye(len(subj_name2id_dict))
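
# Illustrative sketch of the expected infer.cfg layout, inferred from the keys read above.
# The values and file paths are placeholders, and create_and_load_model() may read additional
# [model] options not shown here:
#
#   [infer]
#   FPS = 30
#
#   [model]
#   SPEECH_WINDOW_SIZE = 16
#
#   [data]
#   TEMPLATE_MESH_FILEPATH = path/to/template_mesh.obj
#   SUBJ_NAME2ID_FILEPATH = path/to/subj_name2id.pkl
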
# create gradio interface
demo = gr.Blocks(title="Speech2RoboExp Demo (v1.0)")
with demo:
    gr.Markdown(
        """
        ## Welcome to the Speech2RoboExp Experience (v1.0)!

        Get ready to bring your speech to life! Our cutting-edge Speech2RoboExp model can transform your voice into
        dynamic facial expressions for both robots and 3D characters.

        Here's how it works:
        1. **Upload your speech audio**: Choose any audio file from your device.
        2. **Select a speaking style**: Customize the animation by selecting a speaking style that suits your character.
        3. **Generate**: Click the 'Generate' button and watch the magic happen!

        Note that the first run may take a bit longer due to the initial setup. Subsequent runs will be faster.
        You can also choose quick options from the examples provided below.

        In the output, you'll get a preview of the facial motion and a downloadable blendshape sequence in the
        ARKit standard. This sequence can be used to animate a 3D face model in 3D modeling software like Blender,
        Maya, or Unreal Engine, giving you the freedom to create stunning visual narratives.

        Got questions or need help? Feel free to reach out to the author at [email](mailto:illiteratehit@gmail.com).

        Dive in and enjoy the Speech2RoboExp experience!
        """
    )
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(sources=["upload"], type="filepath", label="Input speech", interactive=True)
            with gr.Row():
                subj_input = gr.Dropdown(label='Input speaking style', choices=list(subj_name2id_dict.keys()),
                                         value='FaceTalk_170809_00138_TA', interactive=True)
                eye_blink_input = gr.Radio(label='With eye blink', choices=['Yes', 'No'],
                                           value='Yes', interactive=True)
            generate_button = gr.Button(value="Generate", interactive=True)
            gr.Examples(
                [["speech2roboExp/test_samples/jobs_speech_1.wav", "FaceTalk_170809_00138_TA", "Yes"],
                 ["speech2roboExp/test_samples/kobe_retire_speech.wav", "FaceTalk_170725_00137_TA", "Yes"],
                 ["speech2roboExp/test_samples/churchill_speech_1.wav", "FaceTalk_170913_03279_TA", "Yes"],
                 ["speech2roboExp/test_samples/mawanglang.wav", "FaceTalk_170809_00138_TA", "Yes"],
                 ["speech2roboExp/test_samples/song_1.wav", "FaceTalk_170811_03275_TA", "Yes"]],
                inputs=[audio_input, subj_input, eye_blink_input],
                label="Examples"
            )
    with gr.Row():
        video_output = gr.Video(label="Generated facial motion preview", height=550, width=332,
                                interactive=False)
        with gr.Column():
            gr.Markdown(
                """
                Output blendshape sequence
                --------------------------
                Uses the ARKit blendshape standard.
                """
            )
            livelink_file_output = gr.File(label="Livelink format", interactive=False)
            audio_file_output = gr.File(label="Paired audio", interactive=False)
            gr.Markdown(
                """
                Summary
                -------
                """
            )
            time_output = gr.Dataframe(headers=['Generation time', 'Render time'],
                                       datatype=['number', 'number'],
                                       interactive=False,
                                       label="Time cost [seconds]")
            speed_output = gr.Dataframe(headers=['Generation speed', 'Render speed'],
                                        datatype=['number', 'number'],
                                        interactive=False,
                                        label="Speed [frames per second]")

    generate_button.click(fn=run_inference, inputs=[audio_input, subj_input, eye_blink_input],
                          outputs=[video_output, livelink_file_output, audio_file_output,
                                   time_output, speed_output])


if __name__ == '__main__':
    # run gradio interface
    demo.queue()
    demo.launch()
    # demo.launch(server_name="0.0.0.0", server_port=7860)
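    # Gradio can also create a temporary public share link if needed (network access permitting):
    # demo.launch(share=True)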