Developing WebHTK, a Chrome-based speech-recognition web demo in Golang: the HTML5 recording part
Our department runs the "Xiamen University–iFLYTEK Joint Laboratory for Minnan Speech and Language" in cooperation with iFLYTEK. The first-stage speech-recognition demo will probably be simple isolated-word recognition for Minnan. An Android demo I wrote last year already exists, so I plan to build a PC-side demo as well. The basic engine is in place; the UI and database will be tuned later. Besides, I have been learning Golang recently, so why not put it to use? The web demo stack is Golang (Beego framework) on the back end, HTML5 on the front end, and MongoDB as the database.
This post covers the front-end recording work, and streaming the audio data to the back end over an HTML5 WebSocket, where it is saved.

Let's look at the code.

record.html:
```html
<!DOCTYPE HTML>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>PONPON Chat by WebSockets</title>
<script type="text/javascript" src="/static/lib/recorder.js"></script>
<script type="text/javascript" src="/static/lib/jquery-1.10.1.min.js"></script>
<style type="text/css">
</style>
</head>
<body>
<audio controls autoplay></audio>
<form>
<input type="button" id="record" value="Record">
<input type="button" id="export" value="Send">
</form>
<div id="message"></div>
</body>
<script type="text/javascript">
// failure callback for getUserMedia
var onFail = function (e) {
    console.log('Rejected!', e);
};
// success callback: wrap the media stream in a Recorder instance
var onSuccess = function (s) {
    var context = new webkitAudioContext();
    var mediaStreamSource = context.createMediaStreamSource(s);
    rec = new Recorder(mediaStreamSource);
}

//window.URL = URL || window.URL || window.webkitURL;
navigator.getUserMedia = navigator.getUserMedia ||
    navigator.webkitGetUserMedia || navigator.mozGetUserMedia || navigator.msGetUserMedia;

var rec;
var audio = document.querySelector('audio');

function startRecording() {
    if (navigator.getUserMedia) {
        // onSuccess / onFail run when getUserMedia succeeds / fails
        navigator.getUserMedia({audio: true}, onSuccess, onFail);
    } else {
        console.log('navigator.getUserMedia not present');
    }
}
startRecording();

//--------------------
$('#record').click(function () {
    rec.record();
    var dd = ws.send("start");
    $("#message").text("Click export to stop recording");
    /*
    setInterval fires the callback once every 3000 ms: each
    period it runs rec.exportWAV(), which clears the recording
    buffer and sends the blob over the websocket, until
    clearInterval(intervalKey) stops it.
    */
    intervalKey = setInterval(function () {
        rec.exportWAV(function (blob) {
            rec.clear();
            ws.send(blob);
            //audio.src = URL.createObjectURL(blob);
        });
    }, 3000);
});

$('#export').click(function () {
    // first send the stop command
    rec.stop();
    ws.send("stop");
    clearInterval(intervalKey);
    $("#message").text("Sent to the server!");
});

var ws = new WebSocket('ws://' + window.location.host + '/record/join');
ws.onopen = function () {
    console.log("Opened connection to websocket");
};
ws.onclose = function () {
    console.log("Closed connection to websocket");
}
ws.onerror = function () {
    console.log("Cannot connect to websocket");
}
ws.onmessage = function (e) {
    audio.src = URL.createObjectURL(e.data);
}
</script>
</html>
```
The key in this code is navigator.getUserMedia, which obtains the client's media input. When the page is opened, Chrome asks the user for permission to use the microphone. On success:
```javascript
// create the WebAudio context
var context = new webkitAudioContext();
// create a media-stream source node from the stream
var mediaStreamSource = context.createMediaStreamSource(s);
// recorder instance
rec = new Recorder(mediaStreamSource);
```
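The webkitAudioContext constructor and the callback-style getUserMedia above are the prefixed Chrome APIs of that era. For reference, on current browsers the equivalent would look like the following — a hedged sketch, not part of the original demo (the Promise-based API also requires a secure context, i.e. https or localhost):

```javascript
// Hedged sketch: modern, unprefixed equivalents of the calls above,
// mirroring what onSuccess/onFail do in the original page.
navigator.mediaDevices.getUserMedia({ audio: true })
    .then(function (stream) {
        var context = new AudioContext();
        var mediaStreamSource = context.createMediaStreamSource(stream);
        rec = new Recorder(mediaStreamSource);
    })
    .catch(function (e) {
        console.log('Rejected!', e);
    });
```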
Recording starts by calling rec.record(). Here is recorder.js:
```javascript
(function (window) {
    var WORKER_PATH = '/static/lib/recorderWorker.js';
    var Recorder = function (source, cfg) {
        var config = cfg || {};
        var bufferLen = config.bufferLen || 4096 * 2;
        this.context = source.context;
        /*
        The last two arguments of createJavaScriptNode() are the
        numbers of input and output channels: 1 means mono, 2 stereo.
        */
        this.node = this.context.createJavaScriptNode(bufferLen, 2, 2);
        var worker = new Worker(config.workerPath || WORKER_PATH);
        worker.postMessage({
            command: 'init',
            config: {
                sampleRate: this.context.sampleRate
            }
        });
        var recording = false,
            currCallback;
        this.node.onaudioprocess = function (e) {
            if (!recording) return;
            worker.postMessage({
                command: 'record',
                buffer: [
                    // left-channel samples
                    e.inputBuffer.getChannelData(0),
                    // right-channel samples
                    e.inputBuffer.getChannelData(1)
                ]
            });
        }
        this.configure = function (cfg) {
            for (var prop in cfg) {
                if (cfg.hasOwnProperty(prop)) {
                    config[prop] = cfg[prop];
                }
            }
        }
        this.record = function () {
            recording = true;
        }
        this.stop = function () {
            recording = false;
        }
        this.clear = function () {
            worker.postMessage({ command: 'clear' });
        }
        this.getBuffer = function (cb) {
            currCallback = cb || config.callback;
            worker.postMessage({ command: 'getBuffer' })
        }
        this.exportWAV = function (cb, type) {
            currCallback = cb || config.callback;
            type = type || config.type || 'audio/wav';
            if (!currCallback) throw new Error('Callback not set');
            worker.postMessage({
                command: 'exportWAV',
                type: type
            });
        }
        worker.onmessage = function (e) {
            var blob = e.data;
            currCallback(blob);
        }
        source.connect(this.node);
        this.node.connect(this.context.destination); // this should not be necessary
    };
    Recorder.forceDownload = function (blob, filename) {
        var url = (window.URL || window.webkitURL).createObjectURL(blob);
        alert(url);
        var link = window.document.createElement('a');
        link.href = url;
        link.download = filename || 'output.wav';
        var click = document.createEvent("Event");
        click.initEvent("click", true, true);
        link.dispatchEvent(click);
    }
    window.Recorder = Recorder;
})(window);
```
Once recording has started, this.node.onaudioprocess fires repeatedly, pulling sample data out of the capture buffer. Note this part:
```javascript
worker.postMessage({
    command: 'record',
    buffer: [
        e.inputBuffer.getChannelData(0),
        e.inputBuffer.getChannelData(1)
    ]
});
```
buffer carries both channels' data from the capture device: each onaudioprocess call delivers one Float32Array of bufferLen samples per channel, with values in [-1, 1]. At 48 kHz and bufferLen = 8192 that works out to a callback roughly every 171 ms.
recorderWorker.js:

```javascript
var recLength = 0,
    recBuffersL = [],
    recBuffersR = [],
    sampleRate;

this.onmessage = function (e) {
    switch (e.data.command) {
        case 'init':
            init(e.data.config);
            break;
        case 'record':
            record(e.data.buffer);
            break;
        case 'exportWAV':
            exportWAV(e.data.type);
            break;
        case 'getBuffer':
            getBuffer();
            break;
        case 'clear':
            clear();
            break;
    }
};

function init(config) {
    sampleRate = config.sampleRate;
}

// append both channels' samples coming from the capture device
function record(inputBuffer) {
    recBuffersL.push(inputBuffer[0]);
    recBuffersR.push(inputBuffer[1]);
    recLength += inputBuffer[0].length;
}

// encode the buffered samples and post the result back as a Blob
function exportWAV(type) {
    var bufferL = mergeBuffers(recBuffersL, recLength);
    var bufferR = mergeBuffers(recBuffersR, recLength);
    var interleaved = interleave(bufferL, bufferR);
    var dataview = encodeWAV(interleaved);
    var audioBlob = new Blob([dataview], { type: type });
    this.postMessage(audioBlob);
}

// copy the recording buffers into a send buffer
function getBuffer() {
    var buffers = [];
    buffers.push(mergeBuffers(recBuffersL, recLength));
    buffers.push(mergeBuffers(recBuffersR, recLength));
    this.postMessage(buffers);
}

// reset the recording buffers
function clear() {
    recLength = 0;
    recBuffersL = [];
    recBuffersR = [];
}

// concatenate a list of Float32Arrays into one
function mergeBuffers(recBuffers, recLength) {
    var result = new Float32Array(recLength);
    var offset = 0;
    for (var i = 0; i < recBuffers.length; i++) {
        result.set(recBuffers[i], offset);
        offset += recBuffers[i].length;
    }
    return result;
}

// interleave the left and right channels sample by sample
function interleave(inputL, inputR) {
    var length = inputL.length + inputR.length;
    var result = new Float32Array(length);
    var index = 0,
        inputIndex = 0;
    while (index < length) {
        result[index++] = inputL[inputIndex];
        result[index++] = inputR[inputIndex];
        inputIndex++;
    }
    return result;
}

// convert float samples in [-1, 1] to 16-bit signed PCM
function floatTo16BitPCM(output, offset, input) {
    for (var i = 0; i < input.length; i++, offset += 2) {
        var s = Math.max(-1, Math.min(1, input[i]));
        output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
    }
}

function writeString(view, offset, string) {
    for (var i = 0; i < string.length; i++) {
        view.setUint8(offset + i, string.charCodeAt(i));
    }
}

// write the 44-byte WAV header followed by the samples
function encodeWAV(samples) {
    var buffer = new ArrayBuffer(44 + samples.length * 2);
    var view = new DataView(buffer);
    /* RIFF identifier */
    writeString(view, 0, 'RIFF');
    /* RIFF chunk length */
    view.setUint32(4, 36 + samples.length * 2, true);
    /* RIFF type */
    writeString(view, 8, 'WAVE');
    /* format chunk identifier */
    writeString(view, 12, 'fmt ');
    /* format chunk length */
    view.setUint32(16, 16, true);
    /* sample format (raw PCM) */
    view.setUint16(20, 1, true);
    /* channel count */
    view.setUint16(22, 2, true);
    /* sample rate */
    view.setUint32(24, sampleRate, true);
    /* byte rate (sample rate * block align) */
    view.setUint32(28, sampleRate * 4, true);
    /* block align (channel count * bytes per sample) */
    view.setUint16(32, 4, true);
    /* bits per sample */
    view.setUint16(34, 16, true);
    /* data chunk identifier */
    writeString(view, 36, 'data');
    /* data chunk length */
    view.setUint32(40, samples.length * 2, true);
    floatTo16BitPCM(view, 44, samples);
    return view;
}
```
As it stands, the recorder only produces 48000 Hz, 16-bit data. I tried adjusting the capture parameters to get the target format, 8000 Hz 16-bit mono, but it failed: the output stayed at 48000 Hz 16-bit. Since I am not yet familiar with front-end JavaScript, I will come back to the sample-rate problem later.
Update: for mono recording, change recorder.js to this.context.createJavaScriptNode(bufferLen, 1, 1) and cut all the right-channel handling out of recorderWorker.js.
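For the sample rate, one common workaround — a hedged sketch, not from the original post; the function name and decimation scheme are mine, and a real resampler should low-pass filter before decimating — is to downsample in the worker before encoding:

```javascript
// Hedged sketch: naive downsampling from the context rate (e.g. 48000 Hz)
// to a target rate such as 8000 Hz by keeping every Nth sample.
// NOTE: proper resampling needs a low-pass filter first to avoid aliasing;
// this only illustrates where such a step would slot into recorderWorker.js.
function downsample(input, inputRate, targetRate) {
    var ratio = inputRate / targetRate;            // e.g. 48000 / 8000 = 6
    var length = Math.floor(input.length / ratio);
    var result = new Float32Array(length);
    for (var i = 0; i < length; i++) {
        result[i] = input[Math.floor(i * ratio)];
    }
    return result;
}

// inside exportWAV(), before encoding, one might then write:
// var mono8k = downsample(mergeBuffers(recBuffersL, recLength), sampleRate, 8000);
// var dataview = encodeWAV(mono8k); // header must then say 8000 Hz, 1 channel
```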
Now back to this part of record.html:
```javascript
// on page load, the client sends the websocket handshake request
var ws = new WebSocket('ws://' + window.location.host + '/record/join');
// handshake succeeded
ws.onopen = function () {
    console.log("Opened connection to websocket");
};
// connection closed
ws.onclose = function () {
    console.log("Closed connection to websocket");
}
// handshake failed
ws.onerror = function () {
    console.log("Cannot connect to websocket");
}
```
Each time the page is loaded or refreshed, the client sends a WebSocket handshake to the server. Once the handshake succeeds, the ws.send(...) calls in the JS are bound to the buttons, so after recording, a click on the button sends the data.
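One detail worth making explicit (a hedged one-liner, not in the original): ws.onmessage above hands e.data straight to URL.createObjectURL, which requires a Blob. Chrome's default binaryType is already "blob", but setting it removes any doubt:

```javascript
// ensure binary frames arrive as Blobs (Chrome's default), so that
// URL.createObjectURL(e.data) in ws.onmessage works as expected
ws.binaryType = "blob";
```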
How does the Golang/Beego back end handle the data? In the page's controller, define a Join method. The code is rough — a first working version; channels and other refinements will come later:
```go
package controllers

import (
    "bufio"
    "net/http"
    "os"
    "path"
    "strings"

    "github.com/astaxie/beego"
    "github.com/garyburd/go-websocket/websocket"
)

// mlogger is the author's logging helper; its import is omitted in the original listing.

type RecordController struct {
    beego.Controller
}

func (this *RecordController) Join() {
    // take the requesting client's IP address
    remoteAddr := strings.Split(this.Ctx.Request.RemoteAddr, ":")[0]
    mlogger.i("Receiving record data from host: " + remoteAddr)
    // upgrade the HTTP request to a websocket connection
    ws, err := websocket.Upgrade(this.Ctx.ResponseWriter, this.Ctx.Request.Header, nil, 1024, 1024)
    if _, ok := err.(websocket.HandshakeError); ok {
        http.Error(this.Ctx.ResponseWriter, "Not a websocket handshake", 400)
        return
    } else if err != nil {
        beego.Error("Cannot setup WebSocket connection:", err)
        return
    }
    // use the IP address as the wav file name
    wavName := "record/" + remoteAddr + ".wav"
    os.MkdirAll(path.Dir(wavName), os.ModePerm)
    _, e := os.Stat(wavName)
    if e == nil {
        // remove any existing wav file
        os.Remove(wavName)
    }
    f, err := os.Create(wavName)
    mlogger.i("Host: " + remoteAddr + " creating file handler ...")
    defer f.Close()
    if err != nil {
        mlogger.e(err)
        return
    }
    w := bufio.NewWriter(f)
    for {
        // read the next frame from the websocket
        _, p, err := ws.ReadMessage()
        if err != nil {
            mlogger.i("Host: " + remoteAddr + " disconnected ...")
            break
        }
        length := len(p)
        if length == 4 || length == 5 {
            // length == 4 means the page sent ws.send("stop")
            // length == 5 means the page sent ws.send("start")
            action := string(p)
            mlogger.i("Client's action: " + action + " recording!")
            if action == "stop" {
                goto SAVE
            } else {
                goto RESTART
            }
        }
        w.Write(p)
        continue
    SAVE:
        mlogger.i("Host: " + remoteAddr + " saving wav file ...")
        w.Flush()
        mlogger.i("Host: " + remoteAddr + " flushing writer ...")
        f.Close()
        mlogger.i("Host: " + remoteAddr + " closing the file handler ...")
        continue
    RESTART:
        os.Remove(wavName)
        f, err = os.Create(wavName)
        mlogger.i("Host: " + remoteAddr + " creating file handler ...")
        // defer f.Close()
        if err != nil {
            mlogger.e(err)
            return
        }
        w = bufio.NewWriter(f)
    }
    return
}
```
And the routing setup:
```go
beego.Router("/record", &controllers.RecordController{})
beego.Router("/record/join", &controllers.RecordController{}, "get:Join")
```
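To exercise the endpoint without a browser, something like the following works — a hedged sketch: it uses the maintained gorilla/websocket successor of the library above, and the host/port are assumptions. It walks through the same start → binary chunks → stop framing the page uses:

```go
package main

import (
    "log"

    "github.com/gorilla/websocket" // successor of garyburd's go-websocket
)

func main() {
    // assumes the Beego app listens on localhost:8080
    conn, _, err := websocket.DefaultDialer.Dial("ws://localhost:8080/record/join", nil)
    if err != nil {
        log.Fatal("dial:", err)
    }
    defer conn.Close()

    // same framing as the page: a "start" text frame, some binary
    // audio chunks, then a "stop" text frame that makes the server flush
    conn.WriteMessage(websocket.TextMessage, []byte("start"))
    chunk := make([]byte, 4096) // stand-in for 16-bit PCM data
    conn.WriteMessage(websocket.BinaryMessage, chunk)
    conn.WriteMessage(websocket.TextMessage, []byte("stop"))
}
```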
Update:

Note this part of record.html:
```javascript
intervalKey = setInterval(function () {
    rec.exportWAV(function (blob) {
        rec.clear();
        ws.send(blob);
    });
}, 3000);
```
The ws.send(blob) inside setInterval ships a blob to the server every 3 seconds, and encodeWAV in recorderWorker.js prepends a 44-byte WAV header whose data-length field only covers the samples recorded within that one period. The server therefore ends up with a file holding more than 3 seconds of audio while the header claims at most 3 seconds' worth (at 48 kHz, 16-bit stereo, each 3-second chunk is 48000 × 2 × 2 × 3 = 576,000 bytes, and that is all any header reports, no matter how many chunks follow). Worse, a header is written into every chunk, and those 44 bytes are not valid audio data. So encodeWAV in recorderWorker.js should be changed to:
```javascript
function encodeWAV(samples) {
    var buffer = new ArrayBuffer(samples.length * 2);
    var view = new DataView(buffer);
    floatTo16BitPCM(view, 0, samples);
    return view;
}
```
This way only the raw audio stream goes to the server. In record.html, the send button's click handler becomes:
```javascript
$('#export').click(function () {
    rec.stop();
    if (intervalKey == null) {
        $("#message").text("Please record before sending!");
        return;
    }
    // sampleRate and channels are assumed to be set elsewhere on the page
    ws.send(sampleRate);
    ws.send(channels);
    console.log('sampleRate:' + sampleRate + ',channels:' + channels);
    ws.send("stop");
    rec.clear();
    clearInterval(intervalKey);
    $("#message").text("Sent to the server!");
});
```
The server thus receives the sample rate and channel count of the data. Correspondingly, add code to the Go Join method that builds the 44-byte WAV header, writes it in front of the buffered raw audio, and saves the wav file:
```go
type wavHeader []byte

// SetHeader builds the 44-byte WAV file header.
func SetHeader(sampleRate int, channel int, length uint32) (header wavHeader) {
    header = make([]byte, 44)
    chunkSize := length + 36
    header[0] = 'R'
    header[1] = 'I'
    header[2] = 'F'
    header[3] = 'F'
    header[4] = byte(chunkSize & 0xff)
    header[5] = byte((chunkSize >> 8) & 0xff)
    header[6] = byte((chunkSize >> 16) & 0xff)
    header[7] = byte((chunkSize >> 24) & 0xff)
    header[8] = 'W'
    header[9] = 'A'
    header[10] = 'V'
    header[11] = 'E'
    header[12] = 'f'
    header[13] = 'm'
    header[14] = 't'
    header[15] = ' '
    header[16] = 16 // fmt chunk length
    header[17] = 0
    header[18] = 0
    header[19] = 0
    header[20] = 1 // PCM format
    header[21] = 0
    header[22] = byte(channel & 0xff) // 1 or 2
    header[23] = 0
    header[24] = byte(sampleRate & 0xff)         // e.g. 8000 -> 64
    header[25] = byte((sampleRate >> 8) & 0xff)  // e.g. 8000 -> 31
    header[26] = byte((sampleRate >> 16) & 0xff) // 0
    header[27] = byte((sampleRate >> 24) & 0xff) // 0
    header[28] = byte((sampleRate * 2 * channel) & 0xff) // byte rate, low byte first
    header[29] = byte((sampleRate * 2 * channel) >> 8 & 0xff)
    header[30] = byte((sampleRate * 2 * channel) >> 16 & 0xff)
    header[31] = byte((sampleRate * 2 * channel) >> 24 & 0xff)
    header[32] = byte((channel * 2) & 0xff) // block align: 2 or 4
    header[33] = 0
    header[34] = 16 // bits per sample
    header[35] = 0
    header[36] = 'd'
    header[37] = 'a'
    header[38] = 't'
    header[39] = 'a'
    header[40] = byte(length & 0xff)
    header[41] = byte((length >> 8) & 0xff)
    header[42] = byte((length >> 16) & 0xff)
    header[43] = byte((length >> 24) & 0xff)
    return
}
```
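Putting it together on the Go side might look like this — a hedged sketch: the pcm argument and file path are placeholders for wherever the Join method buffers the raw stream, and the function name is mine:

```go
// Hedged sketch: glue the 44-byte header onto the buffered raw PCM and save
// it as a playable wav file. `pcm` stands in for the bytes accumulated from
// the websocket; 8000 Hz mono matches the target format. Needs "os" imported.
func saveWav(pcm []byte, filename string) error {
    header := SetHeader(8000, 1, uint32(len(pcm)))
    f, err := os.Create(filename)
    if err != nil {
        return err
    }
    defer f.Close()
    if _, err := f.Write(header); err != nil {
        return err
    }
    _, err = f.Write(pcm)
    return err
}
```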
If you need to strip the silent parts of the audio, see my github: github.com/liuxp0827/waveIO. The latest waveIO package hasn't been uploaded yet; a small tweak to the delSilence function will do.
For the complete code, see the attachment: http://down.51cto.com/data/1092540.
With that, the whole path works: recording in the front end, streaming the data over WebSocket, and reading and writing it to disk on the Beego back end.