Spaces:

ZiyuG
/

SignLanguage

Running on A10G

App Files Files Community

SignLanguage / evaluate.py

ZiyuG

Update evaluate.py

3ad8486 verified about 2 months ago

raw

history blame

14.4 kB

	import cv2, math
	import json, os, torch
	import numpy as np
	from sklearn.preprocessing import Normalizer
	from align import align_filter


	def merge_intervals_with_breaks(time_intervals, errors, max_break=1.5):
	    """Merge time intervals that carry the same error and lie close together.

	    Intervals are paired with ``errors`` by position, ordered by start time,
	    and consecutive intervals are fused when their error values compare equal
	    and the gap between them is at most ``max_break``.

	    Args:
	        time_intervals: Sequence of ``(start, end)`` pairs.
	        errors: Sequence of error values, one per interval; compared with ``==``.
	        max_break: Largest gap (same unit as the intervals) that still merges.

	    Returns:
	        A list of ``((start, end), error)`` tuples whose bounds are rounded to
	        the nearest integer with the built-in ``round``. Empty when there is
	        nothing to merge.
	    """
	    print(f"时间区间: {time_intervals}")
	    print(f"错误: {errors}")

	    if not time_intervals:
	        return []

	    # sorted() is stable, so intervals with equal starts keep their input order.
	    ordered = sorted(zip(time_intervals, errors), key=lambda pair: pair[0][0])
	    if not ordered:
	        # zip() truncates to the shorter input, so an empty ``errors`` lands here.
	        return []

	    merged_intervals = []
	    (cur_start, cur_end), cur_error = ordered[0]

	    for (start, end), error in ordered[1:]:
	        # Compare against the exact (unrounded) end so that rounding never
	        # widens or narrows the gap that is being tested.
	        if error == cur_error and start - cur_end <= max_break:
	            cur_end = max(cur_end, end)
	        else:
	            merged_intervals.append(((round(cur_start), round(cur_end)), cur_error))
	            cur_start, cur_end, cur_error = start, end, error

	    # The trailing interval is rounded exactly like every other one.
	    merged_intervals.append(((round(cur_start), round(cur_end)), cur_error))

	    return merged_intervals
	def findcos_single(k1, k2):
	    """Score how closely two coordinate sets point in the same direction.

	    Both inputs are flattened to column vectors, their cosine similarity is
	    computed, and the result is mapped linearly from ``[-1, 1]`` to
	    ``[0, 100]`` (identical direction -> 100, orthogonal -> 50, opposite -> 0).

	    Args:
	        k1: Array-like of numbers (any nesting); flattened before use.
	        k2: Array-like with the same number of elements as ``k1``.

	    Returns:
	        A ``(score, 0)`` tuple. ``score`` is a NumPy array of shape ``(1, 1)``
	        because it comes from a ``(1, n) @ (n, 1)`` product; the second element
	        is always ``0``.

	    Note:
	        If either input is all zeros the norm product is zero and the division
	        yields a non-finite value rather than raising.
	    """
	    u = np.array(k1).reshape(-1, 1)
	    v = np.array(k2).reshape(-1, 1)

	    dot = np.matmul(np.transpose(u), v)
	    norm_product = np.sqrt(np.sum(np.multiply(u, u))) * np.sqrt(np.sum(np.multiply(v, v)))
	    cosine_similarity = dot / norm_product

	    # Rescale cosine similarity from [-1, 1] to a 0-100 score.
	    return 100 * (1 - (1 - cosine_similarity) / 2), 0


	def findCosineSimilarity_1(keypoints1, keypoints2):
	    """Score the similarity of two poses using a fixed subset of keypoints.

	    Only entries ``5:13`` and ``91:133`` of each keypoint sequence are used
	    (presumably the arm joints and both hands of a 133-point whole-body
	    layout -- confirm against the pose model). The selected coordinates are
	    flattened into one vector per pose and compared by cosine similarity,
	    which is then mapped linearly from ``[-1, 1]`` to ``[0, 100]``.

	    Args:
	        keypoints1: Sequence of per-keypoint coordinate lists for the first pose.
	        keypoints2: Same layout for the second pose.

	    Returns:
	        A ``(score, 0)`` tuple. ``score`` is a NumPy array of shape ``(1, 1)``;
	        the second element is always ``0``.
	    """
	    user1 = np.concatenate((keypoints1[5:13], keypoints1[91:133]), axis=0).reshape(-1, 1)
	    user2 = np.concatenate((keypoints2[5:13], keypoints2[91:133]), axis=0).reshape(-1, 1)

	    dot = np.matmul(np.transpose(user1), user2)
	    norm_product = np.sqrt(np.sum(np.multiply(user1, user1))) * np.sqrt(np.sum(np.multiply(user2, user2)))
	    cosine_similarity = dot / norm_product

	    # Rescale cosine similarity from [-1, 1] to a 0-100 score.
	    return 100 * (1 - (1 - cosine_similarity) / 2), 0

	def load_json(path):
	    """Read the JSON document stored at ``path`` and return the decoded object.

	    The file is opened explicitly as UTF-8 so decoding does not depend on the
	    platform's default locale encoding.
	    """
	    with open(path, 'r', encoding='utf-8') as file:
	        return json.load(file)

	def _body_parts(indices):
	    """Map failing segment indices to a sorted list of unique body-part labels."""
	    parts = set()
	    for i in indices:
	        if i in (5, 7):
	            parts.add('Left Arm')
	        if i in (6, 8):
	            parts.add('Right Arm')
	        if 90 < i < 112:
	            parts.add('Left Hand')
	        if i >= 112:
	            parts.add('Right Hand')
	    # Sorting gives a stable order, so equal label sets always compare equal
	    # as lists when neighbouring frames are merged later on.
	    return sorted(parts)


	def _pixel(keypoints, i):
	    """Return keypoint ``i`` as an integer ``(x, y)`` tuple usable by OpenCV."""
	    return (int(keypoints[i][0]), int(keypoints[i][1]))


	def _scalar(value):
	    """Collapse a scalar or single-element array into a plain ``float``."""
	    return float(np.asarray(value).reshape(-1)[0])


	def _skeleton_segments(connections1, connections2, count):
	    """Yield ``(label, i, j)`` for each drawable segment among ``count`` keypoints.

	    ``connections1`` holds explicit 1-based ``(start, end)`` joint pairs; each
	    yields one segment labelled with its 0-based start. ``connections2`` holds
	    1-based ``(first, last)`` runs; each yields the consecutive segments
	    ``(i, i + 1)`` of the run, all labelled with the run's 0-based first index.
	    """
	    for start, end in connections1:
	        a, b = start - 1, end - 1
	        if a < count and b < count:
	            yield a, a, b
	    for start, end in connections2:
	        first, last = start - 1, end - 1
	        for i in range(first, last):
	            if i + 1 < count:
	                yield first, i, i + 1


	def eval(test, standard, tmpdir):
	    """Compare the user's video with the standard video and render a report.

	    Steps, all relative to ``tmpdir``:
	      1. Run ``inferencer_demo.py`` on ``user.mp4`` (per the original comment
	         this produces ``user.json``).
	      2. Call ``align_filter`` on the ``standard`` / ``user`` stems (per the
	         original comment: frame alignment that produces aligned videos).
	      3. Read ``standard.json`` / ``user.json`` and both videos frame by frame,
	         draw the skeletons, mark user segments whose score against the
	         standard falls below the thresholds, and write ``output.mp4`` with
	         the untouched frame pair on top and the annotated pair below.

	    Args:
	        test: Unused by this function.
	        standard: Unused by this function.
	        tmpdir: Directory holding ``user.mp4``, ``standard.mp4`` and
	            ``standard.json``; outputs are written there as well.

	    Returns:
	        ``(mean_score, merged_intervals, comments)`` where ``mean_score`` is
	        the average per-frame similarity (``0.0`` when no frame could be
	        scored), ``merged_intervals`` is the output of
	        ``merge_intervals_with_breaks`` for frames with large errors, and
	        ``comments`` is ``0`` when the standard video ended first (hint: speed
	        up), ``1`` when the user video ended first (hint: slow down) and ``2``
	        when both ended together.
	    """
	    import subprocess

	    test_p = tmpdir + "/user.mp4"
	    standard_p = tmpdir + "/standard.mp4"
	    # Argument list instead of a shell string, so paths containing spaces or
	    # shell metacharacters are passed through unchanged.
	    subprocess.run(['python', 'inferencer_demo.py', test_p, '--pred-out-dir', tmpdir], check=False)

	    align_filter(tmpdir + '/standard', tmpdir + '/user', tmpdir)

	    data_00 = load_json(tmpdir + '/standard.json')
	    data_01 = load_json(tmpdir + '/user.json')
	    # Only frames present in both keypoint files can be compared.
	    min_length = min(len(data_00), len(data_01))

	    # 1-based keypoint indices: explicit arm/torso pairs, then finger runs.
	    connections1 = [(9,11), (7,9), (6,7), (6,8), (8,10), (7,13), (6,12), (12,13)]
	    connections2 = [(130,133), (126,129), (122,125), (118,121), (114,117), (93,96), (97,100), (101,104), (105,108), (109,112)]

	    fps = 5  # Output frame rate; also converts frame counts to seconds below.
	    scores = []
	    error_dict = {}
	    comments = -1
	    cnt = 0

	    cap_00 = cv2.VideoCapture(standard_p)
	    cap_01 = cv2.VideoCapture(test_p)
	    out = None
	    try:
	        frame_width = int(cap_00.get(cv2.CAP_PROP_FRAME_WIDTH))
	        frame_height = int(cap_00.get(cv2.CAP_PROP_FRAME_HEIGHT))

	        # Output canvas is a 2x2 grid of frames sized like the standard video.
	        out = cv2.VideoWriter(tmpdir + '/output.mp4', cv2.VideoWriter_fourcc(*'H264'), fps, (frame_width * 2, frame_height * 2))

	        # Start both videos from the first frame.
	        cap_00.set(cv2.CAP_PROP_POS_FRAMES, 0)
	        cap_01.set(cv2.CAP_PROP_POS_FRAMES, 0)

	        line_width = max(1, frame_width // 300)

	        while True:
	            ret_00, frame_00 = cap_00.read()
	            ret_01, frame_01 = cap_01.read()
	            if not (ret_00 and ret_01):
	                # Record which video ran out first, then stop.
	                if ret_01:
	                    comments = 0
	                elif ret_00:
	                    comments = 1
	                else:
	                    comments = 2
	                break

	            # Keep an unannotated copy of the pair for the top row.
	            combined_frame_ori = np.hstack((frame_00, frame_01))

	            # NOTE(review): read after read(), CAP_PROP_POS_FRAMES is the index
	            # of the *next* frame, so these ids may be one ahead of the frame
	            # just decoded -- confirm against how the JSON files are indexed.
	            frame_id_00 = int(cap_00.get(cv2.CAP_PROP_POS_FRAMES))
	            frame_id_01 = int(cap_01.get(cv2.CAP_PROP_POS_FRAMES))

	            # Standard video: draw the skeleton in blue plus green joint dots.
	            keypoints_00_ori = None
	            if frame_id_00 < min_length:
	                keypoints_00_ori = data_00[frame_id_00]["instances"][0]["keypoints"]
	                for _, i, j in _skeleton_segments(connections1, connections2, len(keypoints_00_ori)):
	                    cv2.line(frame_00, _pixel(keypoints_00_ori, i), _pixel(keypoints_00_ori, j), (255, 0, 0), line_width)
	                for point in keypoints_00_ori[5:13] + keypoints_00_ori[91:133]:
	                    cv2.circle(frame_00, (int(point[0]), int(point[1])), 1, (0, 210, 0), -1)

	            # Reset per frame so a frame without keypoints never inherits the
	            # previous frame's findings.
	            error = []
	            bigerror = []

	            # User video: score every segment against the standard pose.
	            if frame_id_01 < min_length and keypoints_00_ori is not None:
	                keypoints_01 = data_01[frame_id_01]["instances"][0]["keypoints"]
	                for label, i, j in _skeleton_segments(connections1, connections2, len(keypoints_01)):
	                    start_point = _pixel(keypoints_01, i)
	                    end_point = _pixel(keypoints_01, j)
	                    cur_score = _scalar(findcos_single(
	                        [start_point, end_point],
	                        [_pixel(keypoints_00_ori, i), _pixel(keypoints_00_ori, j)],
	                    )[0])

	                    # Below 98.8 counts as an error (segments starting at index
	                    # 5 are never flagged); below 97.8 is also a large error.
	                    if cur_score < 98.8 and label != 5:
	                        error.append(label)
	                        cv2.line(frame_01, start_point, end_point, (0, 0, 255), 2)  # red
	                        if cur_score < 97.8:
	                            bigerror.append(label)
	                    else:
	                        cv2.line(frame_01, start_point, end_point, (255, 0, 0), line_width)  # blue

	                for point in keypoints_01[5:13] + keypoints_01[91:133]:
	                    cv2.circle(frame_01, (int(point[0]), int(point[1])), 1, (0, 210, 0), -1)

	                # Whole-pose similarity for this frame feeds the final average.
	                min_cos, _ = findCosineSimilarity_1(keypoints_00_ori, keypoints_01)
	                scores.append(_scalar(min_cos))

	            # Annotated pair, side by side.
	            combined_frame = np.hstack((frame_00, frame_01))

	            if error:
	                # List the affected body parts over the user's half of the frame.
	                text_x = int(frame_width * 1.75)
	                text_y = int(frame_height * 0.2)
	                cv2.putText(combined_frame, "Please check: ", (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 2)
	                for row, item in enumerate(_body_parts(error)):
	                    cv2.putText(combined_frame, "- " + item, (text_x + 10, text_y + 50 + row * 50), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 2)

	            if bigerror:
	                # Remember which parts were badly off in this output frame.
	                error_dict[cnt] = _body_parts(bigerror)

	            cnt += 1
	            out.write(np.vstack((combined_frame_ori, combined_frame)))
	    finally:
	        # Release the captures and the writer even if processing failed.
	        cap_00.release()
	        cap_01.release()
	        if out is not None:
	            out.release()

	    # Convert flagged output-frame numbers to one-frame-long intervals in
	    # seconds, then merge neighbouring intervals that share the same labels.
	    time_intervals = [(frame / fps, (frame + 1) / fps) for frame in error_dict]
	    errors = list(error_dict.values())
	    final_merged_intervals = merge_intervals_with_breaks(time_intervals, errors)

	    # Avoid ZeroDivisionError when no frame could be scored.
	    mean_score = sum(scores) / len(scores) if scores else 0.0
	    return mean_score, final_merged_intervals, comments

	def install():
	    """Install the runtime dependencies by shelling out to pip, mim and git.

	    Reinstalls NumPy pinned below 2 (these two steps raise
	    ``subprocess.CalledProcessError`` on failure), installs ``mmengine`` via
	    ``mim``, then clones ``mmpose`` and ``mmdetection`` into the current
	    directory and installs each in editable mode. The ``os.system`` steps are
	    best-effort: their exit codes are not checked.

	    Earlier revisions also tried ``mim``/``pip`` installs of mmcv, mmdet and
	    mmpose; those calls were disabled and are not performed here.
	    """
	    import subprocess

	    subprocess.run(["pip", "uninstall", "-y", "numpy"], check=True)
	    subprocess.run(["pip", "install", "numpy<2"], check=True)

	    os.system('mim install mmengine')

	    workdir = os.getcwd()
	    repos = (
	        (
	            'https://github.com/open-mmlab/mmpose.git',
	            'mmpose',
	            ('pip install -r requirements.txt', 'pip install -v -e .'),
	        ),
	        (
	            'https://github.com/open-mmlab/mmdetection.git',
	            'mmdetection',
	            ('pip install -v -e .',),
	        ),
	    )
	    for url, directory, commands in repos:
	        os.system('git clone ' + url)
	        os.chdir(directory)
	        try:
	            for command in commands:
	                os.system(command)
	        finally:
	            # Always return to the starting directory, even if a step raises.
	            os.chdir(workdir)