JavaScript正則中g標誌
緣起
有一天在思否社群看到有個問題,大致描述如下
// One regex object carrying the `g` flag is shared by every filter callback invocation.
const list = ['a', 'b', '-', 'c', 'd'];
const reg = /[a-z]/g;
const letters = list.filter(i => reg.test(i));
// letters === ['a', 'c'];
// Without the `g` flag the filter returns every letter.
// Why does adding `g` change the result?
對問題而言,遍歷中的i
就是一個字元,不需要用到g
。
但是就我對正則的理解(過於淺薄)感覺上有沒有g
(只是全域性搜尋,不會匹配到就停下來)應該不影響,激發了我的好奇心。
上面的問題可以簡化為如下寫法
// Minimal reproduction: the same `g`-flagged regex alternates between true and false.
const reg = /[a-z]/g;
reg.test('a'); // => true
reg.test('a'); // => false
reg.test('a'); // => true
reg.test('a'); // => false
reg.test('a'); // => true
解密過程
首先可以確定的表現一定是g
導致的
搜尋引擎
開啟 MDN 仔細檢視g
標誌的作用,得到結論和我的理解無二。
我猜想應該就是g
可能啟用了某種快取,又因為reg
相對過濾器是全域性變數,我將程式碼改為:
const list = ['a', 'b', '-', 'c', 'd'];
// The regex literal is evaluated inside the callback, so each call tests with a fresh regex object.
const letters = list.filter(i => /[a-z]/g.test(i));
// letters === ['a', 'b', 'c', 'd'];
將正則的宣告放到每一次遍歷中,得到的結果就是正確的,驗證了我的猜想。同時也可以確定,這個「快取」就存在於正則物件的某個地方
下面我找到對應的原始碼來檢視問題的原因
原始碼層面
由於最近在看 Rust,所以選擇使用 Rust 編寫的 JavaScript 引擎 boa 的原始碼來檢視:https://github.com/boa-dev/boa
開啟專案後,點選.
進入 vscode 模式,command+p 搜尋 regexp 關鍵詞
進入test.rs
檔案,command+f 搜尋/g
可以找到在 90 行有個last_index()
的測試
/// Checks how `lastIndex` of a global (`g`) regex changes across successive `test` calls.
#[test]
fn last_index() {
    let mut context = Context::default();
    let init = r#"
var regex = /[0-9]+(\.[0-9]+)?/g;
"#;

    // `forward` evaluates the given source against `context` (mutating it)
    // and returns the result as a string.
    eprintln!("{}", forward(&mut context, init));

    // Each step is evaluated in order against the same context: (source, expected output).
    let steps = [
        ("regex.lastIndex", "0"),
        ("regex.test('1.0foo')", "true"),
        ("regex.lastIndex", "3"),
        ("regex.test('1.0foo')", "false"),
        ("regex.lastIndex", "0"),
    ];
    for &(source, expected) in &steps {
        assert_eq!(forward(&mut context, source), expected);
    }
}
看到了有lastIndex
關鍵字,到這裡已經大致猜到問題的原因了:帶 g 標誌的正則會記錄上一次匹配結束位置的下標,導致出現問題。
我們將視線移入到mod.rs
檔案中,搜尋test
在 631 行看到了fn test()
方法
/// `RegExp.prototype.test ( S )`
///
/// Converts the first argument to a string, runs `Self::abstract_exec` on it with
/// `this` as the regular expression, and reports whether a match was produced.
///
/// A missing first argument falls back to the default `JsValue` before the
/// string conversion.
///
/// # Errors
///
/// Returns a `TypeError` when `this` is not an object, and propagates any error
/// raised by the string conversion or by `Self::abstract_exec`.
pub(crate) fn test(
    this: &JsValue,
    args: &[JsValue],
    context: &mut Context,
) -> JsResult<JsValue> {
    // 1. Let R be the this value.
    // 2. If Type(R) is not Object, throw a TypeError exception.
    let this = this.as_object().ok_or_else(|| {
        context
            .construct_type_error("RegExp.prototype.test method called on incompatible value")
    })?;

    // 3. Let string be ? ToString(S).
    let arg_str = args
        .first()
        .cloned()
        .unwrap_or_default()
        .to_string(context)?;

    // 4. Let match be ? RegExpExec(R, string).
    let m = Self::abstract_exec(this, arg_str, context)?;

    // 5. If match is not null, return true; else return false.
    Ok(JsValue::new(m.is_some()))
}
在test()
方法中找到了Self::abstract_exec()
方法
/// `RegExpExec ( R, S )`
///
/// Looks up the `exec` property on `this`. When it is callable, it is invoked with
/// `this` as the receiver and `input` as the only argument, and its result is returned
/// (as `None` when the result is not an object). Otherwise the call is forwarded to
/// `Self::abstract_builtin_exec`, provided `this` is a RegExp object.
///
/// # Errors
///
/// Returns a `TypeError` when a callable `exec` yields something that is neither an
/// object nor null, or when `exec` is not callable and `this` is not a RegExp.
/// Errors from the property lookup and from the calls themselves are propagated.
pub(crate) fn abstract_exec(
    this: &JsObject,
    input: JsString,
    context: &mut Context,
) -> JsResult<Option<JsObject>> {
    // 1. Assert: Type(R) is Object.
    // 2. Assert: Type(S) is String.

    // 3. Let exec be ? Get(R, "exec").
    let exec = this.get("exec", context)?;

    match exec.as_callable() {
        // 4. If IsCallable(exec) is true, then
        Some(callable) => {
            // a. Let result be ? Call(exec, R, « S »).
            let outcome = callable.call(&this.clone().into(), &[input.into()], context)?;

            if outcome.is_object() || outcome.is_null() {
                // c. Return result.
                Ok(outcome.as_object().cloned())
            } else {
                // b. If Type(result) is neither Object nor Null, throw a TypeError exception.
                context.throw_type_error("regexp exec returned neither object nor null")
            }
        }
        // 6. Return ? RegExpBuiltinExec(R, S).
        None if this.is_regexp() => Self::abstract_builtin_exec(this, &input, context),
        // 5. Perform ? RequireInternalSlot(R, [[RegExpMatcher]]).
        None => context.throw_type_error("RegExpExec called with invalid value"),
    }
}
又在Self::abstract_exec()
方法中找到了Self::abstract_builtin_exec()
方法
/// `RegExpBuiltinExec ( R, S )`
///
/// Runs the compiled matcher of the RegExp object `this` against `input`, starting at
/// the object's `lastIndex` (or at 0 when neither the `g` nor the `y` flag is present).
///
/// On success it returns `Some(array)` where the array carries the `index`, `input`
/// and `groups` properties, the matched substring at index `0` and one entry per
/// capture group. On failure it returns `None`.
///
/// When the `g` or `y` flag is present, `lastIndex` is written back to the object:
/// it is set to the end of the match on success and reset to 0 when the search runs
/// past the end of the input (or, for sticky regexes, when the match fails).
/// This is why repeated calls on the same global regex can yield different results.
///
/// # Errors
///
/// Returns a `TypeError` when `this` is not a RegExp object or when `lastIndex`
/// cannot be mapped to a byte offset of `input`; errors raised while reading or
/// writing `lastIndex` and while building the result array are propagated.
pub(crate) fn abstract_builtin_exec(
    this: &JsObject,
    input: &JsString,
    context: &mut Context,
) -> JsResult<Option<JsObject>> {
    // 1. Assert: R is an initialized RegExp instance.
    let rx = {
        let obj = this.borrow();
        if let Some(rx) = obj.as_regexp() {
            rx.clone()
        } else {
            return context.throw_type_error("RegExpBuiltinExec called with invalid value");
        }
    };

    // 2. Assert: Type(S) is String.

    // 3. Let length be the number of code units in S.
    let length = input.encode_utf16().count();

    // 4. Let lastIndex be ℝ(? ToLength(? Get(R, "lastIndex"))).
    let mut last_index = this.get("lastIndex", context)?.to_length(context)?;

    // 5. Let flags be R.[[OriginalFlags]].
    let flags = &rx.original_flags;

    // 6. If flags contains "g", let global be true; else let global be false.
    let global = flags.contains('g');

    // 7. If flags contains "y", let sticky be true; else let sticky be false.
    let sticky = flags.contains('y');

    // 8. If global is false and sticky is false, set lastIndex to 0.
    if !global && !sticky {
        last_index = 0;
    }

    // 9. Let matcher be R.[[RegExpMatcher]].
    let matcher = &rx.matcher;

    // 10. If flags contains "u", let fullUnicode be true; else let fullUnicode be false.
    let unicode = flags.contains('u');

    // 11. Let matchSucceeded be false.
    // 12. Repeat, while matchSucceeded is false,
    let match_value = loop {
        // a. If lastIndex > length, then
        if last_index > length {
            // i. If global is true or sticky is true, then
            if global || sticky {
                // 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
                this.set("lastIndex", 0, true, context)?;
            }

            // ii. Return null.
            return Ok(None);
        }

        // b. Let r be matcher(S, lastIndex).
        // `last_index` counts UTF-16 code units; re-encode the prefix to obtain the
        // corresponding UTF-8 byte offset that is handed to the matcher.
        let last_byte_index = match String::from_utf16(
            &input.encode_utf16().take(last_index).collect::<Vec<u16>>(),
        ) {
            Ok(s) => s.len(),
            Err(_) => {
                return context
                    .throw_type_error("Failed to get byte index from utf16 encoded string")
            }
        };
        let r = matcher.find_from(input, last_byte_index).next();

        match r {
            // c. If r is failure, then
            None => {
                // i. If sticky is true, then
                if sticky {
                    // 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
                    this.set("lastIndex", 0, true, context)?;

                    // 2. Return null.
                    return Ok(None);
                }

                // ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode).
                last_index = advance_string_index(input, last_index, unicode);
            }

            Some(m) => {
                // c. If r is failure, then
                // A match that does not start exactly at lastIndex is handled like a
                // failure at this position.
                // NOTE(review): `last_index` is a UTF-16 code unit index while the matcher
                // was started at the byte offset `last_byte_index`; if `m.start()` is a
                // byte offset the two only agree for ASCII prefixes — confirm.
                #[allow(clippy::if_not_else)]
                if m.start() != last_index {
                    // i. If sticky is true, then
                    if sticky {
                        // 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
                        this.set("lastIndex", 0, true, context)?;

                        // 2. Return null.
                        return Ok(None);
                    }

                    // ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode).
                    last_index = advance_string_index(input, last_index, unicode);
                // d. Else,
                } else {
                    //i. Assert: r is a State.
                    //ii. Set matchSucceeded to true.
                    break m;
                }
            }
        }
    };

    // 13. Let e be r's endIndex value.
    let mut e = match_value.end();

    // 14. If fullUnicode is true, then
    if unicode {
        // e is an index into the Input character list, derived from S, matched by matcher.
        // Let eUTF be the smallest index into S that corresponds to the character at element e of Input.
        // If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S.
        // b. Set e to eUTF.
        e = input.split_at(e).0.encode_utf16().count();
    }

    // 15. If global is true or sticky is true, then
    if global || sticky {
        // a. Perform ? Set(R, "lastIndex", 𝔽(e), true).
        this.set("lastIndex", e, true, context)?;
    }

    // 16. Let n be the number of elements in r's captures List. (This is the same value as 22.2.2.1's NcapturingParens.)
    let n = match_value.captures.len();

    // 17. Assert: n < 2^32 - 1.
    // The bound is 2^32 - 1 (i.e. `u32::MAX`), not 23^2 - 1: the previous constant made
    // debug builds panic for regexes with 528 or more capture groups.
    debug_assert!(n < (u32::MAX as usize));

    // 18. Let A be ! ArrayCreate(n + 1).
    // 19. Assert: The mathematical value of A's "length" property is n + 1.
    let a = Array::array_create(n + 1, None, context)?;

    // 20. Perform ! CreateDataPropertyOrThrow(A, "index", 𝔽(lastIndex)).
    a.create_data_property_or_throw("index", match_value.start(), context)
        .expect("this CreateDataPropertyOrThrow call must not fail");

    // 21. Perform ! CreateDataPropertyOrThrow(A, "input", S).
    a.create_data_property_or_throw("input", input.clone(), context)
        .expect("this CreateDataPropertyOrThrow call must not fail");

    // 22. Let matchedSubstr be the substring of S from lastIndex to e.
    // An out-of-range or non-boundary range falls back to the empty string.
    let matched_substr = if let Some(s) = input.get(match_value.range()) {
        s
    } else {
        ""
    };

    // 23. Perform ! CreateDataPropertyOrThrow(A, "0", matchedSubstr).
    a.create_data_property_or_throw(0, matched_substr, context)
        .expect("this CreateDataPropertyOrThrow call must not fail");

    // 24. If R contains any GroupName, then
    // 25. Else,
    let named_groups = match_value.named_groups();
    let groups = if named_groups.clone().count() > 0 {
        // a. Let groups be ! OrdinaryObjectCreate(null).
        let groups = JsValue::from(JsObject::empty());

        // Perform 27.f here
        // f. If the ith capture of R was defined with a GroupName, then
        // i. Let s be the CapturingGroupName of the corresponding RegExpIdentifierName.
        // ii. Perform ! CreateDataPropertyOrThrow(groups, s, capturedValue).
        for (name, range) in named_groups {
            // Named groups without a range are skipped.
            if let Some(range) = range {
                let value = if let Some(s) = input.get(range.clone()) {
                    s
                } else {
                    ""
                };

                groups
                    .to_object(context)?
                    .create_data_property_or_throw(name, value, context)
                    .expect("this CreateDataPropertyOrThrow call must not fail");
            }
        }
        groups
    } else {
        // a. Let groups be undefined.
        JsValue::undefined()
    };

    // 26. Perform ! CreateDataPropertyOrThrow(A, "groups", groups).
    a.create_data_property_or_throw("groups", groups, context)
        .expect("this CreateDataPropertyOrThrow call must not fail");

    // 27. For each integer i such that i ≥ 1 and i ≤ n, in ascending order, do
    for i in 1..=n {
        // a. Let captureI be ith element of r's captures List.
        let capture = match_value.group(i);

        let captured_value = match capture {
            // b. If captureI is undefined, let capturedValue be undefined.
            None => JsValue::undefined(),
            // c. Else if fullUnicode is true, then
            // d. Else,
            Some(range) => {
                if let Some(s) = input.get(range) {
                    s.into()
                } else {
                    "".into()
                }
            }
        };

        // e. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(i)), capturedValue).
        a.create_data_property_or_throw(i, captured_value, context)
            .expect("this CreateDataPropertyOrThrow call must not fail");
    }

    // 28. Return A.
    Ok(Some(a))
}
Self::abstract_builtin_exec()
方法中存在global
以及last_index
這樣看來最終執行的方法就是在這裡了,仔細檢視該方法中的程式碼(程式碼寫的很詳細而且每一步都有註釋)
在第 12 步中:
- lastIndex 超過文字長度時返回 null;若 global(或 sticky)存在,則先將 lastIndex 置為 0
- 獲取匹配到的值(match_value)
  - 如果未匹配到,則將 lastIndex 置為 advance_string_index() 方法的返回值
  - advance_string_index() 不在當前問題的考慮範圍,詳見 https://tc39.es/ecma262/#sec-advancestringindex
第 13 步獲取匹配到的值的 endIndex
第 15 步將 lastIndex 置為 endIndex
至此也就整明白了g
標誌的含義:每個正則例項上都有一個lastIndex
屬性,帶有 g 標誌時,如果匹配成功,lastIndex
不會重置為 0,而是記錄本次匹配結束的位置,下一次匹配會從這個位置開始。
結論
在問題程式碼中分析
const reg = /[a-z]/g; // right after creation, lastIndex is 0
reg.test('a'); // => true; after the first match, lastIndex is 1
reg.test('a'); // => false; lastIndex is 1 and the string has a single character, so the match fails and lastIndex is reset to 0
reg.test('a'); // => true; from here on the two steps above repeat
reg.test('a'); // => false;
reg.test('a'); // => true;