package streaming_vad

import (
    "fmt"
    "math"
)

const (
    FrameFlagSpeechPre = iota
    FrameFlagSpeechStart
    FrameFlagSpeechPresent
    FrameFlagSpeechEnd
    FrameFlagSpeechPost
)

const (
    VadFlagPrepare  = "VadFlagPrepare"  // preparing
    VadFlagSpeaking = "VadFlagSpeaking" // speaking
    VadFlagPause    = "VadFlagPause"    // short pause (comma)
    VadFlagNoSpeech = "VadFlagNoSpeech" // long pause (end of sentence)
    VadFlagUnknown  = "VadFlagUnknown"  // unknown state
)

type ParametersForFdType struct {
    SampleRate          uint32
    Threshold           float32
    MinThreshold        float32
    FrameLengthInSecond float32

    StartRejectUpdateNoiseLevelTimeInSecond float32
    StartRejectSpeechTimeInSecond           float32

    SpeechStartWindowLengthInSecond      float32 // window length used to detect the start of speech
    SpeechPresentWindowLengthInSecond    float32 // window length used to detect ongoing speech activity
    SpeechEndConfirmWindowLengthInSecond float32 // window length used to confirm the end of speech

    SpeechStartRequiredLengthInSecond           float32
    SpeechStartConfirmRequiredLengthInSecond    float32
    SpeechPresentMaintainRequiredLengthInSecond float32
    SpeechEndConfirmRequiredLengthInSecond      float32
}

func (pd *ParametersForFdType) Init() {
    pd.SampleRate = 8000
    pd.Threshold = 150.0
    pd.MinThreshold = 50.0
    pd.FrameLengthInSecond = 0.01

    // Start reject
    pd.StartRejectUpdateNoiseLevelTimeInSecond = 0.2
    pd.StartRejectSpeechTimeInSecond = 0.25

    // Window lengths
    pd.SpeechStartWindowLengthInSecond = 0.15
    pd.SpeechPresentWindowLengthInSecond = 0.4
    pd.SpeechEndConfirmWindowLengthInSecond = 0.15

    // Required lengths
    pd.SpeechStartRequiredLengthInSecond = 0.09
    pd.SpeechStartConfirmRequiredLengthInSecond = 0.075
    pd.SpeechPresentMaintainRequiredLengthInSecond = 0.1
    pd.SpeechEndConfirmRequiredLengthInSecond = 0.12
}

type DecisionStateType struct {
    decisionFlag      bool
    timeInMilliSecond uint32
}

type FrameDecisionType struct {
    params ParametersForFdType

    sampleRate   uint32
    threshold    float32
    minThreshold float32
    adaptFactor  float32
    //
    frameLengthInSecond float32
    noiseLevelValue     float32

    startRejectUpdateNoiseLevelTimeInSecond float32
    startRejectUpdateNoiseLevelFrameNumber  uint32
    startRejectSpeechTimeInSecond           float32
    startRejectSpeechTimeInMilliSecond      uint32

    speechStartWindowLengthInSecond      float32 // window length used to detect the start of speech
    speechPresentWindowLengthInSecond    float32 // window length used to detect ongoing speech activity
    speechEndConfirmWindowLengthInSecond float32 // window length used to confirm the end of speech
    speechStartRequiredLengthInSecond           float32
    speechStartConfirmRequiredLengthInSecond    float32
    speechPresentMaintainRequiredLengthInSecond float32
    speechEndConfirmRequiredLengthInSecond      float32

    decisionStateDeque      []DecisionStateType
    decisionStateDequeSize  uint32
    decisionStateDequeIndex uint32

    processedFramesNumber uint32

    lastFrameFlag int
    thisFrameFlag int
}

func (fd *FrameDecisionType) Init(params ParametersForFdType) {
    fd.params = params

    fd.sampleRate = params.SampleRate
    fd.threshold = params.Threshold
    fd.minThreshold = params.MinThreshold
    fd.adaptFactor = fd.threshold

    fd.frameLengthInSecond = params.FrameLengthInSecond
    fd.noiseLevelValue = fd.threshold / 2.0

    fd.startRejectUpdateNoiseLevelTimeInSecond = params.StartRejectUpdateNoiseLevelTimeInSecond
    fd.startRejectUpdateNoiseLevelFrameNumber = uint32(fd.startRejectUpdateNoiseLevelTimeInSecond / fd.frameLengthInSecond)
    fd.startRejectSpeechTimeInSecond = params.StartRejectSpeechTimeInSecond
    fd.startRejectSpeechTimeInMilliSecond = uint32(fd.startRejectSpeechTimeInSecond*1e3 + 0.5)

    fd.speechStartWindowLengthInSecond = params.SpeechStartWindowLengthInSecond
    fd.speechPresentWindowLengthInSecond = params.SpeechPresentWindowLengthInSecond
    fd.speechEndConfirmWindowLengthInSecond = params.SpeechEndConfirmWindowLengthInSecond

    fd.speechStartRequiredLengthInSecond = params.SpeechStartRequiredLengthInSecond
    fd.speechStartConfirmRequiredLengthInSecond = params.SpeechStartConfirmRequiredLengthInSecond
    fd.speechPresentMaintainRequiredLengthInSecond = params.SpeechPresentMaintainRequiredLengthInSecond
    fd.speechEndConfirmRequiredLengthInSecond = params.SpeechEndConfirmRequiredLengthInSecond

    // Initialize decisionStateDeque, decisionStateDequeSize, decisionStateDequeIndex:
    // the deque must be large enough to cover the largest decision window.
    largestWindowLengthInSecond := fd.speechStartWindowLengthInSecond
    if largestWindowLengthInSecond < fd.speechPresentWindowLengthInSecond {
        largestWindowLengthInSecond = fd.speechPresentWindowLengthInSecond
    }
    if largestWindowLengthInSecond < fd.speechEndConfirmWindowLengthInSecond {
        largestWindowLengthInSecond = fd.speechEndConfirmWindowLengthInSecond
    }
    decisionStateDequeSize := uint32(largestWindowLengthInSecond/fd.frameLengthInSecond + 0.5)
    fd.RefreshDecisionStateDeque(decisionStateDequeSize)

    fd.processedFramesNumber = 0

    fd.lastFrameFlag = FrameFlagSpeechPre
    fd.thisFrameFlag = FrameFlagSpeechPre
}

/*
ProcessStart
Resets the detection state. Needed, for example, after a stretch of continuous speech
has been forcibly cut off because it ran too long.
*/
func (fd *FrameDecisionType) ProcessStart(resetThreshold bool) {
    fd.RefreshDecisionStateDeque(fd.decisionStateDequeSize)

    if resetThreshold {
        fd.threshold = fd.params.Threshold
        fd.adaptFactor = fd.threshold
        fd.noiseLevelValue = fd.threshold / 2.0
        fd.processedFramesNumber = 0
    }

    fd.lastFrameFlag = FrameFlagSpeechPre
    fd.thisFrameFlag = FrameFlagSpeechPre
}

func (fd *FrameDecisionType) RefreshDecisionStateDeque(decisionStateDequeSize uint32) {
    fd.decisionStateDeque = make([]DecisionStateType, decisionStateDequeSize)
    fd.decisionStateDequeSize = decisionStateDequeSize
    fd.decisionStateDequeIndex = 0
}

func (fd *FrameDecisionType) UpdateDecisionState(frameStartTimeInMilliSecond uint32, decisionFlag bool) {
    fd.decisionStateDeque[fd.decisionStateDequeIndex].decisionFlag = decisionFlag
    fd.decisionStateDeque[fd.decisionStateDequeIndex].timeInMilliSecond = frameStartTimeInMilliSecond
    fd.decisionStateDequeIndex = (fd.decisionStateDequeIndex + 1) % fd.decisionStateDequeSize
}

// SumDecisionTrue returns how much of the most recent durationInSecond seconds was flagged as speech.
func (fd *FrameDecisionType) SumDecisionTrue(durationInSecond float32) (activeDurationInSecond float32) {
    if len(fd.decisionStateDeque) == 0 {
        return 0.0
    }
    indexTemp := int64(fd.decisionStateDequeIndex) - 1
    if indexTemp < 0 {
        indexTemp = int64(fd.decisionStateDequeSize) - 1
    }
    decisionFlag := fd.decisionStateDeque[indexTemp].decisionFlag
    endInMilliSecond := int64(fd.decisionStateDeque[indexTemp].timeInMilliSecond)
    beginInMilliSecond := endInMilliSecond - int64(durationInSecond*1e3)
    if beginInMilliSecond < 0 {
        beginInMilliSecond = 0
    }

    var timeSum uint32 = 0
    for i := uint32(1); i < fd.decisionStateDequeSize; i++ {
        if fd.decisionStateDeque[indexTemp].timeInMilliSecond < uint32(beginInMilliSecond) {
            break
        }
        indexTemp--
        if indexTemp < 0 {
            indexTemp = int64(fd.decisionStateDequeSize) - 1
        }
        if decisionFlag {
            timeSum += uint32(endInMilliSecond) - fd.decisionStateDeque[indexTemp].timeInMilliSecond
        }
        decisionFlag = fd.decisionStateDeque[indexTemp].decisionFlag
        endInMilliSecond = int64(fd.decisionStateDeque[indexTemp].timeInMilliSecond)
    }
    activeDurationInSecond = float32(timeSum) * 1e-3
    return activeDurationInSecond
}

/*
SpeechFrameProcess
Processes one frame of audio and assigns a speech flag to it.
*/
func (fd *FrameDecisionType) SpeechFrameProcess(frameStartTimeInMilliSecond uint32, buffer []int16) {
    bufferSize := uint32(len(buffer))

    /**************************Calculate the RMS***************************/
    sumTemp := int64(0)
    ssqTemp := int64(0)
    for i := uint32(0); i < bufferSize; i++ {
        sumTemp = sumTemp + int64(buffer[i])
        ssqTemp = ssqTemp + int64(buffer[i])*int64(buffer[i])
    }
    sum := float64(sumTemp)
    sum /= float64(bufferSize)
    ssq := float64(ssqTemp)
    rms := float32(math.Sqrt((ssq / float64(bufferSize)) - (sum * sum)))
    //fmt.Printf("rms %f\n", rms)
    /**********************************************************************/

    var decisionFlag bool
    if frameStartTimeInMilliSecond < fd.startRejectSpeechTimeInMilliSecond {
        decisionFlag = false
    } else {
        decisionFlag = rms > fd.threshold && rms > 400
    }
    //fmt.Printf("decisionFlag %t\n", decisionFlag)
    fd.UpdateDecisionState(frameStartTimeInMilliSecond, decisionFlag)

    if fd.thisFrameFlag == FrameFlagSpeechPre {
        if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartRequiredLengthInSecond {
            fd.thisFrameFlag = FrameFlagSpeechStart
        }
    } else if fd.thisFrameFlag == FrameFlagSpeechStart {
        if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartConfirmRequiredLengthInSecond {
            fd.thisFrameFlag = FrameFlagSpeechPresent
        } else {
            // TODO: this branch does not look reachable in practice.
            if fd.speechStartConfirmRequiredLengthInSecond != 0 {
                fd.thisFrameFlag = FrameFlagSpeechPre
            }
        }
    } else if fd.thisFrameFlag == FrameFlagSpeechPresent {
        if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) < fd.speechPresentMaintainRequiredLengthInSecond {
            fd.thisFrameFlag = FrameFlagSpeechEnd
        }
    } else if fd.thisFrameFlag == FrameFlagSpeechEnd {
        if fd.SumDecisionTrue(fd.speechEndConfirmWindowLengthInSecond) <= fd.speechEndConfirmRequiredLengthInSecond {
            fd.thisFrameFlag = FrameFlagSpeechPre
        } else if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) >= fd.speechPresentMaintainRequiredLengthInSecond {
            //fd.thisFrameFlag = FrameFlagSpeechPre
            // NOTE: the condition here probably ought to be < rather than >=;
            // it may have been intended to model a short pause, e.g. for inserting a comma.
            fd.thisFrameFlag = FrameFlagSpeechPre
        }
    }

    // Adapt the detection threshold based on the current frame's RMS.
    if fd.thisFrameFlag == FrameFlagSpeechPre && !decisionFlag {
        fd.threshold = (0.02 * rms * 2) + (0.98 * fd.threshold)
        fd.adaptFactor = fd.threshold
    } else if decisionFlag && fd.thisFrameFlag == FrameFlagSpeechPresent {
        if rms < fd.adaptFactor {
            fd.adaptFactor = 0.01*rms + 0.99*fd.adaptFactor
        } else {
            fd.adaptFactor = 0.05*rms + 0.95*fd.adaptFactor
        }
        thresholdTemp := fd.noiseLevelValue + 0.3*fd.adaptFactor
        fd.threshold = (0.1 * thresholdTemp) + 0.9*fd.threshold
    }
    //
    if fd.threshold < fd.minThreshold {
        fd.threshold = fd.minThreshold
    }

    // Update the noise level (and, during the start-reject period, the threshold).
    if fd.processedFramesNumber < fd.startRejectUpdateNoiseLevelFrameNumber {
        alphaAdapt := float32(fd.processedFramesNumber) / float32(fd.startRejectUpdateNoiseLevelFrameNumber)
        fd.noiseLevelValue = (alphaAdapt * fd.noiseLevelValue) + ((1 - alphaAdapt) * rms)
    } else {
        if rms > fd.noiseLevelValue {
            fd.noiseLevelValue = (0.001 * rms) + (0.999 * fd.noiseLevelValue)
        } else {
            fd.noiseLevelValue = (0.05 * rms) + (0.95 * fd.noiseLevelValue)
        }
    }
    if fd.processedFramesNumber < fd.startRejectUpdateNoiseLevelFrameNumber {
        if fd.noiseLevelValue > 400 {
            fd.noiseLevelValue = fd.noiseLevelValue * 0.1
        }
        fd.threshold = fd.noiseLevelValue * 2
        if fd.threshold < fd.minThreshold {
            fd.threshold = fd.minThreshold
        }
    }

    fd.processedFramesNumber++
}

type VadEventMarkerType struct {
    VadFlag string
    Time    uint32
}

type StreamingVadType struct {
    sampleRate           uint32  // sample rate
    silenceTime          float32 // silence duration required to decide that speech has ended
    timeout              float32 // maximum length of a single speech segment; when exceeded, the segment is forcibly ended
    timeoutInMilliSecond uint32

    // VAD detection works frame by frame. Each time audio arrives, the samples left over
    // after cutting whole frames of frameLength are saved, and on the next call the
    // leftover is prepended to the start of the new signal.
    frameLength         uint32  // length of one frame, in samples
    unfinishedFrame     []int16 // leftover samples
    unfinishedFrameSize uint32  // number of leftover samples

    frameDecision FrameDecisionType
    //
    startRejectSpeechTimeInMilliSecond                uint32
    allowedSilenceTimeInSpeechInMilliSecond           uint32
    allowedLongestSpeechDurationInMilliSecond         uint32
    minDurationOfLongSpeechInMilliSecond              uint32
    endOfLongSpeechRequiredSilenceTimeInMilliSecond   uint32
    endOfNormalSpeechRequiredSilenceTimeInMilliSecond uint32
    minDurationOfSpeechToAddCommaInMilliSecond        uint32 // minimum in-speech pause that triggers a comma (VadFlagPause)

    // Only once the VadFlagSpeaking label is detected do we know that speech has started;
    // at that point we step back by prepareDurationInMilliSecond and mark that position
    // as VadFlagPrepare.
    prepareDurationInMilliSecond uint32

    // When the end of speech is detected it is not decided immediately; a further
    // SpeechEndConfirmWindowLengthInSecond of silence is required to confirm it.
    // The speech-end event is placed nonSpeechPadInInMilliSecond after the detected end.
    nonSpeechPadInInMilliSecond uint32

    speechFrameGlobalTimeInMilliSecond uint32

    speechDetectedStartTimeInMilliSecond uint32
    speechDetectedStartTimeIsValid       bool
    speechDetectedEndTimeInMilliSecond   uint32
    speechDetectedEndTimeIsValid         bool
    speechDetectedEndTimeIsValidPossible bool
    speechDetectedStartAndEnd            bool
    //
    lastVadEndTimeInMilliSecond uint32
    thisDetectedState           string // VadFlag

    VadEventMarkerDeque []VadEventMarkerType
}

/*
Init
Typical arguments: silenceTime: 0.4, timeout: 3.0.
The following conditions should hold:
    minDurationOfSpeechToAddCommaInMilliSecond < endOfNormalSpeechRequiredSilenceTimeInMilliSecond
    endOfNormalSpeechRequiredSilenceTimeInMilliSecond < endOfLongSpeechRequiredSilenceTimeInMilliSecond
*/
func (sv *StreamingVadType) Init(sampleRate uint32, silenceTime float32, timeout float32) {
    sv.sampleRate = sampleRate
    sv.timeout = timeout
    sv.timeoutInMilliSecond = uint32(timeout * 1e3)
    sv.frameLength = uint32(0.02 * float32(sampleRate))
    sv.unfinishedFrameSize = 0

    var params ParametersForFdType
    params.Init()
    params.SampleRate = sampleRate
    sv.frameDecision.Init(params)
    //
    sv.startRejectSpeechTimeInMilliSecond = uint32(0.7 * 1e3)
    sv.allowedSilenceTimeInSpeechInMilliSecond = uint32(0.2 * 1e3)
    sv.minDurationOfLongSpeechInMilliSecond = 0
    sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond = 0
    sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond = uint32(silenceTime * 1e3)
    sv.minDurationOfSpeechToAddCommaInMilliSecond = uint32(0.3 * 1e3)
    sv.allowedLongestSpeechDurationInMilliSecond = 0

    sv.prepareDurationInMilliSecond = uint32(sv.frameDecision.speechStartWindowLengthInSecond*1e3) * 2
    sv.nonSpeechPadInInMilliSecond = uint32(silenceTime * 1e3 * 0.5)

    sv.speechFrameGlobalTimeInMilliSecond = 0
    //
    sv.speechDetectedStartTimeInMilliSecond = 0
    sv.speechDetectedStartTimeIsValid = false
    sv.speechDetectedEndTimeInMilliSecond = 0
    sv.speechDetectedEndTimeIsValid = false
    sv.speechDetectedEndTimeIsValidPossible = false
    sv.speechDetectedStartAndEnd = false
    //
    sv.lastVadEndTimeInMilliSecond = 0
    sv.thisDetectedState = VadFlagNoSpeech

    fmt.Println("do StreamingVad Init...")
}

// ProcessSpeechByChunk trims the buffer to a multiple of frameLength; the remainder is
// saved and prepended to the next chunk.
func (sv *StreamingVadType) ProcessSpeechByChunk(buffer []int16) (err error) {
    var validBuffer []int16

    // unfinished frame
    unfinishedFrameSize := uint32(len(sv.unfinishedFrame))
    for i := uint32(0); i < unfinishedFrameSize; i++ {
        validBuffer = append(validBuffer, sv.unfinishedFrame[i])
    }
    // buffer
    bufferSize := uint32(len(buffer))
    for i := uint32(0); i < bufferSize; i++ {
        validBuffer = append(validBuffer, buffer[i])
    }
    // remainder
    remainderSize := uint32(len(validBuffer)) % sv.frameLength
    boundary := uint32(len(validBuffer)) - remainderSize
    sv.unfinishedFrame = validBuffer[boundary:]
    validBuffer = validBuffer[:boundary]

    // >= so that a chunk yielding exactly one full frame is not silently dropped.
    if uint32(len(validBuffer)) >= sv.frameLength {
        sv.ProcessSpeech(validBuffer)
    }
    return nil
}
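// Worked example of the carry-over bookkeeping above (added for illustration; the
// function name and the chunk sizes are assumptions, not part of the original package).
// With sampleRate = 8000 the frame length is 0.02 * 8000 = 160 samples: a 250-sample
// chunk processes one frame and keeps 90 samples; the next 250-sample chunk gives
// 90 + 250 = 340 samples, i.e. two frames processed and 20 samples carried over.
func exampleCarryOverTrace() {
    frameLength := uint32(160) // 0.02 s at 8000 Hz
    carry := uint32(0)
    for _, chunk := range []uint32{250, 250, 100} {
        total := carry + chunk
        carry = total % frameLength
        fmt.Printf("chunk=%d processed=%d carry=%d\n", chunk, total-carry, carry)
    }
}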
// DeprecatedProcessSpeechByChunk is the earlier chunking implementation, kept for
// reference; it is superseded by ProcessSpeechByChunk above.
func (sv *StreamingVadType) DeprecatedProcessSpeechByChunk(buffer []int16) (err error) {
    bufferSize := uint32(len(buffer))
    var validBuffer []int16
    var unfinishedFrame []int16
    var point int16

    validSize := (uint32(len(buffer)) + sv.unfinishedFrameSize) / sv.frameLength * sv.frameLength
    if validSize >= sv.frameLength {
        if sv.unfinishedFrameSize != 0 {
            for i := uint32(0); i < sv.unfinishedFrameSize; i++ {
                point = sv.unfinishedFrame[i]
                validBuffer = append(validBuffer, point)
            }
            for i := uint32(0); i < validSize-sv.unfinishedFrameSize; i++ {
                point = buffer[i]
                validBuffer = append(validBuffer, point)
            }
        } else {
            for i := uint32(0); i < validSize; i++ {
                point = buffer[i]
                validBuffer = append(validBuffer, point)
            }
        }
        sv.ProcessSpeech(validBuffer)
    }
    //fmt.Printf("validBuffer size: %d\n", len(validBuffer))
    //fmt.Printf("validSize: %d\n", validSize)
    //fmt.Printf("last unfinishedFrameSize: %d\n", sv.unfinishedFrameSize)
    sv.unfinishedFrameSize = (bufferSize + sv.unfinishedFrameSize) - validSize
    // NOTE: the -1 below looks like an off-by-one; ProcessSpeechByChunk avoids it.
    begin := bufferSize - sv.unfinishedFrameSize - 1
    for i := uint32(0); i < sv.unfinishedFrameSize; i++ {
        point = buffer[begin+i]
        unfinishedFrame = append(unfinishedFrame, point)
    }
    sv.unfinishedFrame = unfinishedFrame
    fmt.Println("do StreamingVad ProcessSpeechByChunk...")
    return nil
}

// ProcessSpeech decides the VAD state from the speech flags of the previous frame and
// the current frame.
func (sv *StreamingVadType) ProcessSpeech(buffer []int16) {
    bufferLength := uint32(len(buffer))
    if bufferLength%sv.frameLength != 0 {
        panic(fmt.Sprintf("bufferLength (%d) should be a multiple of frameLength (%d)", bufferLength, sv.frameLength))
    }
    var frameBuffer []int16
    for begin := uint32(0); begin+sv.frameLength <= bufferLength; {
        frameBuffer = buffer[begin : begin+sv.frameLength]
        sv.frameDecision.SpeechFrameProcess(sv.speechFrameGlobalTimeInMilliSecond, frameBuffer)
        begin += sv.frameLength

        if sv.frameDecision.lastFrameFlag == FrameFlagSpeechStart && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPresent {
            if sv.thisDetectedState == VadFlagNoSpeech {
                // start
                var prepareTime uint32 = 0
                if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond {
                    prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond
                }
                if prepareTime < sv.lastVadEndTimeInMilliSecond {
                    prepareTime = sv.lastVadEndTimeInMilliSecond
                }
                vadEventMarker := VadEventMarkerType{
                    VadFlag: VadFlagPrepare,
                    Time:    prepareTime,
                }
                sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)

                sv.thisDetectedState = VadFlagSpeaking
                vadEventMarker = VadEventMarkerType{
                    VadFlag: VadFlagSpeaking,
                    Time:    sv.speechFrameGlobalTimeInMilliSecond,
                }
                sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
                //
                sv.speechDetectedEndTimeInMilliSecond = uint32(0)
                sv.speechDetectedEndTimeIsValid = false
                //sv.speechDetectedEndTimeIsValidPossible = false
                sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond - uint32(sv.frameDecision.speechStartWindowLengthInSecond*1e3)
                sv.speechDetectedStartTimeIsValid = true
            } else if sv.thisDetectedState == VadFlagSpeaking &&
                sv.speechDetectedEndTimeIsValid &&
                sv.speechFrameGlobalTimeInMilliSecond-sv.speechDetectedEndTimeInMilliSecond > sv.minDurationOfSpeechToAddCommaInMilliSecond &&
                sv.speechFrameGlobalTimeInMilliSecond-sv.speechDetectedEndTimeInMilliSecond < sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond {
                // pause
                vadEventMarker := VadEventMarkerType{
                    VadFlag: VadFlagPause,
                    Time:    sv.speechDetectedEndTimeInMilliSecond,
                }
                sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
                sv.thisDetectedState = VadFlagSpeaking
                vadEventMarker = VadEventMarkerType{
                    VadFlag: VadFlagSpeaking,
                    Time:    sv.speechFrameGlobalTimeInMilliSecond,
                }
                sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
                //
                sv.speechDetectedEndTimeInMilliSecond = uint32(0)
                sv.speechDetectedEndTimeIsValid = false
                //sv.speechDetectedEndTimeIsValidPossible = false
            } else if sv.thisDetectedState == VadFlagSpeaking &&
                sv.speechFrameGlobalTimeInMilliSecond-sv.speechDetectedEndTimeInMilliSecond <= sv.minDurationOfSpeechToAddCommaInMilliSecond {
                //
                sv.speechDetectedEndTimeInMilliSecond = uint32(0)
                sv.speechDetectedEndTimeIsValid = false
                //sv.speechDetectedEndTimeIsValidPossible = false
            } else {
            }
        }

        // end
        if sv.frameDecision.lastFrameFlag == FrameFlagSpeechEnd && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPre {
            sv.speechDetectedEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond
            sv.speechDetectedEndTimeIsValid = true
            //sv.speechDetectedEndTimeIsValidPossible = true
        }

        // The end of a VAD segment can only be detected after some time has passed since the start.
        if sv.speechFrameGlobalTimeInMilliSecond > sv.startRejectSpeechTimeInMilliSecond {
            if sv.speechDetectedEndTimeIsValid {
                var endOfSpeechRequiredSilenceTime uint32
                if sv.minDurationOfLongSpeechInMilliSecond > 0 &&
                    sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond > 0 &&
                    (sv.speechFrameGlobalTimeInMilliSecond-sv.speechDetectedStartTimeInMilliSecond) > sv.minDurationOfLongSpeechInMilliSecond {
                    endOfSpeechRequiredSilenceTime = sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond
                } else {
                    endOfSpeechRequiredSilenceTime = sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond
                }
                if (sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond) >= endOfSpeechRequiredSilenceTime {
                    endTime := sv.speechDetectedEndTimeInMilliSecond + sv.nonSpeechPadInInMilliSecond
                    sv.speechDetectedEndTimeInMilliSecond = uint32(0)
                    sv.speechDetectedEndTimeIsValid = false
                    sv.speechDetectedStartTimeInMilliSecond = uint32(0)
                    sv.speechDetectedStartTimeIsValid = false
                    sv.thisDetectedState = VadFlagNoSpeech
                    sv.lastVadEndTimeInMilliSecond = endTime
                    vadEventMarker := VadEventMarkerType{
                        VadFlag: VadFlagNoSpeech,
                        Time:    endTime,
                    }
                    sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
                }
            }
        }

        // When the speech duration exceeds the timeout, force a cut.
        if sv.speechDetectedStartTimeIsValid &&
            sv.speechFrameGlobalTimeInMilliSecond-sv.speechDetectedStartTimeInMilliSecond > sv.timeoutInMilliSecond {
            // end
            sv.speechDetectedEndTimeInMilliSecond = uint32(0)
            sv.speechDetectedEndTimeIsValid = false
            sv.speechDetectedStartTimeInMilliSecond = uint32(0)
            sv.speechDetectedStartTimeIsValid = false
            sv.thisDetectedState = VadFlagNoSpeech
            sv.lastVadEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond
            vadEventMarker := VadEventMarkerType{
                VadFlag: VadFlagNoSpeech,
                Time:    sv.speechFrameGlobalTimeInMilliSecond,
            }
            sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)

            // start
            var prepareTime uint32 = 0
            if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond {
                prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond
            }
            if prepareTime < sv.lastVadEndTimeInMilliSecond {
                prepareTime = sv.lastVadEndTimeInMilliSecond
            }
            vadEventMarker = VadEventMarkerType{
                VadFlag: VadFlagPrepare,
                Time:    prepareTime,
            }
            sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)

            sv.thisDetectedState = VadFlagSpeaking
            vadEventMarker = VadEventMarkerType{
                VadFlag: VadFlagSpeaking,
                Time:    sv.speechFrameGlobalTimeInMilliSecond,
            }
            sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
            //
            sv.speechDetectedEndTimeInMilliSecond = uint32(0)
            sv.speechDetectedEndTimeIsValid = false
            //sv.speechDetectedEndTimeIsValidPossible = false
            sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond - uint32(sv.frameDecision.speechStartWindowLengthInSecond*1e3)
            sv.speechDetectedStartTimeIsValid = true
        }

        // loop
        sv.frameDecision.lastFrameFlag = sv.frameDecision.thisFrameFlag
        sv.speechFrameGlobalTimeInMilliSecond += uint32(float32(sv.frameLength) / float32(sv.sampleRate) * 1e3)
    }
}
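// Usage sketch (added for illustration; the function name, the 320-sample chunk size,
// and the way the marker deque is drained here are assumptions, not behaviour
// prescribed by this package): feed 16-bit PCM through ProcessSpeechByChunk in
// arbitrary chunk sizes and read the emitted markers from VadEventMarkerDeque.
func exampleStreamingVadUsage(pcm []int16) {
    var vad StreamingVadType
    vad.Init(8000, 0.4, 3.0) // 8 kHz audio, 0.4 s end-of-speech silence, 3 s timeout

    chunkSize := 320 // 40 ms at 8 kHz; remainders are carried over automatically
    for begin := 0; begin < len(pcm); begin += chunkSize {
        end := begin + chunkSize
        if end > len(pcm) {
            end = len(pcm)
        }
        _ = vad.ProcessSpeechByChunk(pcm[begin:end])

        // Drain the markers produced so far (VadFlagPrepare / VadFlagSpeaking /
        // VadFlagPause / VadFlagNoSpeech, with times in milliseconds).
        for _, marker := range vad.VadEventMarkerDeque {
            fmt.Printf("%s at %d ms\n", marker.VadFlag, marker.Time)
        }
        vad.VadEventMarkerDeque = vad.VadEventMarkerDeque[:0]
    }
}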