正则表达式

正则，熟悉的陌生人，经常用，却很少总结

一、概念

正则表达式是描述字符模式的对象。在很多文本编辑器里，正则表达式通常被用来检索、替换那些匹配某个模式的文本。

三种生成方式：

const reg = eval('/hello/i') // 不推荐
const reg = /hello/i
const reg = new RegExp('hello', 'i')

正则可视化网站：https://regexper.com/

二、修饰符

修饰符与其他语法不同：用字面量方式声明时放到 // 之后；用构造函数声明时，作为第二个参数传入。整个正则表达式可以理解为“正则表达式规则字符串 + 修饰符”。

g：global 执行一个全局匹配
s：dotAll模式：可以让点（.）符号匹配包括换行符在内的任意单个字符
i：ignore case 执行一个不区分大小写的匹配
u：unicode模式：正确处理大于\uFFFF的Unicode字符
m：multiline 多行匹配，让 ^ 和 $ 匹配每一行的行首和行尾，仅在字符串含有换行符时才有区别
y：sticky模式：确保匹配必须从剩余的第一个位置开始

常用的 igm 就不多做解释了，说说不常见的 suy ~

dotAll模式 - s修饰符

s修饰符，可以让点（.）符号匹配包括换行符在内的任意单个字符（默认情况下点不匹配换行符等行终止符）

let se = /foo.bar/
console.log('without s', se.test('foo\nbar')) // false
se = /foo.bar/s
console.log('with s', se.test('foo\nbar')) // true

sticky模式 - y修饰符

y修饰符的作用与g修饰符类似，不同的地方在于：g修饰符只要剩余位置中存在匹配就可以；而y修饰符确保匹配必须从剩余的第一个位置开始，这也就是“粘连”的涵义

let str = 'aaa_aa_a'
let ye = /a+/g
let ye2 = /a+/y
console.log('ye', ye.exec(str)); // aaa
console.log('ye2', ye2.exec(str)); // aaa
console.log('ye', ye.exec(str)); // aa
console.log('ye2', ye2.exec(str)); // null

unicode模式 - u修饰符

用来正确处理大于\uFFFF的Unicode字符；\uD83D\uDC2A -> 🐪 是一个四个字节的UTF-16编码

/\uD83D/u.test('\uD83D\uDC2A') // false
/\uD83D/.test('\uD83D\uDC2A') // true

扩展：常用汉字的unicode编码范围 \u4E00（一）- \u9FA5（龥） https://www.qqxiuzi.cn/zh/hanzi-unicode-bianma.php

三、方法和属性

let reg = new RegExp('hello', 'ig');
// 常用方法
console.log('test:', reg.test('hello world')); // true
console.log('lastIndex', reg.lastIndex); // 5
console.log('exec:', reg.exec('hello , i am from Hello World')); // ['Hello']
console.log('lastIndex', reg.lastIndex); // 23
// 常用属性
console.log('source:', reg.source); // hello
console.log('flags:', reg.flags); // gi
console.log('global:', reg.global); // true
console.log('ignoreCase:', reg.ignoreCase); // true
console.log('multiline:', reg.multiline); // false
// 不常用的方法和属性
console.log('dotAll:', reg.dotAll); // false
console.log('sticky:', reg.sticky); // false
console.log('unicode:', reg.unicode); // false
reg.compile('world', 'i'); // 重新编译成 /world/i（compile 已被废弃，不推荐使用）

3.1: lastIndex

返回一个整数，表示下一次开始搜索的位置。该属性可读写，但是只在进行连续搜索时有意义（带有 g 或 y 修饰符）

const reg = /a+/g
const str = 'aaa_aa'
console.log(reg.exec(str)) // aaa
console.log(reg.lastIndex) // 3
console.log(reg.exec(str)) // aa
console.log(reg.lastIndex) // 6
console.log(reg.exec(str)) // null
console.log(reg.lastIndex) // 0
// 第二圈，从头来
console.log(reg.exec(str)) // aaa
console.log(reg.lastIndex) // 3

注意：带有g修饰符时，正则表达式内部会记住上一次的lastIndex属性，这时不应该更换所要匹配的字符串

// 带有g修饰符时，正则表达式内部会记住上一次的lastIndex属性，这时不应该更换所要匹配的字符串
const r = /aa/g
console.log(r.test('aa')) // true
console.log(r.lastIndex) // 2
console.log(r.test('_aa_')) // false
r.lastIndex = 1 // 将lastIndex手动改成1
console.log(r.test('_aa_')) // true

3.2: STRING与正则相关的方法 match、replace、search、split

1 match 返回一个数组，成员是所有匹配的子字符串；匹配失败返回null

let str = '_x_x'
console.log(str.match(/x/)) // ['x', index: 1, input: '_x_x']
console.log(str.match(/x/g)) // ['x', 'x']
console.log(str.match(/y/)) // null
// 设置正则表达式的lastIndex属性，对match方法无效
let reg = /x/g
reg.lastIndex = 5
console.log(reg.exec(str)) // null
console.log(str.match(reg)) // ['x', 'x']

2 search 返回第一个满足条件的匹配结果在整个字符串中的位置；没有匹配时返回 -1

console.log('_x_x'.search(/x/)) // 1
console.log('_x_x'.search(/y/)) // -1

3 split 按照正则规则分割字符串，返回一个由分割后的各个部分组成的数组

console.log('a,  b,c, d'.split(/, */)) // [ 'a', 'b', 'c', 'd' ]
// 第二个参数是返回数组的最大成员数
console.log('a,  b,c, d'.split(/, */, 2)) // [ 'a', 'b' ]
console.log('aaa**a*'.split(/a*/)) // [ '', '*', '*', '*' ]
// 如果正则表达式带有括号，则括号匹配的部分也会作为数组成员返回
console.log('aaa*a*'.split(/(a*)/)) // ["", "aaa", "*", "a", "*"]

4 replace - 可以替换匹配的值

console.log('aaa'.replace(/a/, 'b')) // baa
console.log('aaa'.replace(/a/g, 'b')) // bbb

// 一、replace方法的第二个参数可以使用美元符号$，用来指代所替换的内容
// 1、$&：匹配的子字符串
console.log('abc'.replace(/b/, '$&')) // abc
// 2、$`：匹配结果前面的文本
console.log('abc'.replace(/b/, '$`')) // aac
// 3、$'：匹配结果后面的文本
console.log('abc'.replace(/b/, '$\'')) // acc
// 4、$n：匹配成功的第n组内容，n是从1开始的自然数
console.log('abc'.replace(/(b)/, '$1d')) // abdc
// 5、$$：指代美元符号$
console.log('abc'.replace(/b/, '$$')) // a$c

// 二、replace方法的第二个参数还可以是一个函数
const res = '3 and 5'.replace(/[0-9]+/g, (match) => {
  return 2 * match
})
console.log(res) // 6 and 10

四、语法

原义字符(字面量字符)
元字符(点字符.、非打印字符、预定义类、边界)
转义符\
字符类[]
字符类取反[^]
范围类[-]
选择符|
量词
贪婪与懒惰(量词后加?)
分组()、反向引用(\n)与非捕获组(?:)
零宽断言（前瞻(?= 先行断言、?! 先行否定断言) 后顾(?<= 后行断言、?<! 后行否定断言)）

4.1 原义字符(字面量字符)

如果某个字符只表示它字面的含义，那么它们就叫做“字面量字符”（literal characters）

/dog/.test('IT dog') // dog为原义字符

4.2 元字符

有一部分字符有特殊含义，不代表字面的意思。它们叫做“元字符”（metacharacters）

// 1、点字符（.）- 除了换行符(\n)、回车符(\r)、行分隔符(\u2028)、段分隔符(\u2029)外的其他任意单字符
console.log(/c.t/.test('c\nt')) // false
console.log(/c.t/.test('cat')) // true
// 2、非打印字符
// \cx：control + 控制字符(只能是字母)，如：\cM = \r
console.log(/a\cMb/.test('a\rb'), /a\cMb/.test('a\cMb')) // true false
// [\b]：匹配退格键(\u0008) 或 其本身
console.log(/[\b]/.test('\u0008'), /[\b]/.test('[\b]')) // true true
// \n：换行符，等价于 \x0a 和 本身
console.log(/a\nb/.test('a\nb'), /a\nb/.test('a\x0ab')) // true true
// \r：回车符，等价于 \x0d 和 本身
console.log(/a\rb/.test('a\rb'), /a\rb/.test('a\x0db')) // true true
// \t：制表符，等价于 \x09 和 本身
console.log(/a\tb/.test('a\tb'), /a\tb/.test('a\x09b')) // true true
// \v：垂直制表符，等价于 \x0b 和 本身
console.log(/a\vb/.test('a\vb'), /a\vb/.test('a\x0bb')) // true true
// \f：换页符，等价于 \x0c 和 本身
console.log(/a\fb/.test('a\fb'), /a\fb/.test('a\x0cb')) // true true
// \0：匹配null字符（\u0000）和 本身
console.log(/a\0b/.test('a\0b'), /a\0b/.test('a\u0000b')) // true true
// \xxx：查找以八进制数xxx规定的字符（如：\127 = W）
console.log(/\127/.test('hello \127'), /\127/.test('hello World')) // true true
// \xdd：查找以16进制数dd规定的字符（如：\x57 = W）
console.log(/\x57/.test('hello \x57'), /\x57/.test('hello World')) // true true
// \uhhhh：匹配一个以四位十六进制数（\u0000-\uFFFF）表示的字符（如：\u0057 = W）
console.log(/\u0057/.test('hello \u0057'), /\u0057/.test('hello World')) // true true
// 3、预定义类：是某些常见模式的简写方式
// \d：数字字符，等价于[0-9]
console.log(/\d/.test('123')) // true
// \D：非数字，等价于[^0-9]
console.log(/\D/.test('123')) // false
// \w：单词字符（字母、数字、下划线），等价于[0-9_a-zA-Z]
console.log(/\w/.test('1'), /\w/.test('a'), /\w/.test('_')) // true true true
// \W：非单词字符，等价于[^0-9_a-zA-Z]
console.log(/\W/.test('1'), /\W/.test('a'), /\W/.test('_')) // false false false
// \s：空白符，大致等价于[ \t\n\x0B\f\r]（包含空格；此外还匹配 \u00a0、\u2028、\u2029 等 Unicode 空白符）
console.log(/\s/.test(' '), /\s/.test('\t')) // true true
// \S：非空白符，即 \s 的取反，大致等价于[^ \t\n\x0B\f\r]
console.log(/\S/.test(' '), /\S/.test('\t')) // false false
// 4、边界
// ^：以..开头
console.log(/^hello/.test('hello world'), /^hello/.test(' hello world')) // true false
// $：以..结尾
console.log(/world$/.test('hello world'), /world$/.test('hello world ')) // true false
// \b：匹配一个单词边界，即单词字符（\w）与非单词字符（\W）之间、或单词字符与字符串首尾之间的位置
console.log(/\bworld/.test('helloworld'), /\bworld/.test('hello world')) // false true
// 注：/\bworld/ 对 "hello world"、"hello-world"、"hello&world"、"hello world233" 都会返回true
// 注：/\bworld/ 对 "hello_world"、"hello2world"、"helloaworld"、"helloworld" 都会返回false
// \B：匹配非单词边界
console.log(/\Bworld/.test('helloworld'), /\Bworld/.test('hello world')) // true false

4.3 转义符\

对于那些有特殊含义的字符，如果要匹配它们本身，就需要在它们前面加上反斜杠。

// 需要转义的，一共12个字符 ^.[$()|*+?{\
// 注：用 new RegExp 生成的需要转义的正则，需要两个\\
console.log((new RegExp('1\\+1')).test('1+1')) // true

4.4 字符类 []

表示有一系列字符可供选择，只要匹配其中一个就可以了

console.log(/[abc]/.test('hello')) // false
console.log(/[abc]/.test('apple')) // true
// 常见的 [12345]  [a-zA-Z0-9]

4.5 字符类取反[^] - 也叫“脱字符”

表示除了字符类之中的字符，其他字符都可以匹配 - 中括号内第一个为^

console.log(/[^abc]/.test('hello')) // true
console.log(/[^abc]/.test('aabbcc')) // false
// 扩展：[^] 表示任意字符，跟 . 相比，也包括换行符
console.log(/[^]/.test('abc'), /[^]/.test('\n'), /[^]/.test('$')) // true true true
console.log(/./.test('abc'), /./.test('\n'), /./.test('$')) // true false true

4.6 范围类[-] - 也叫“连字符”

表示一定范围内

console.log(/[0-9]/.test('123')) // true
console.log(/[a-z]/.test('abc')) // true
console.log(/[a-zA-Z_0-9]/.test('a_b1')) // true
console.log(/[\u4E00-\u9FA5]/.test('你hello')) // true

4.7 选择符|

表示“或关系”（OR）

console.log(/cat|dog/.test('cat'), /cat|dog/.test('at')) // true false
console.log(/cat|dog|fish/.test('cat')) // true
// 选择符会包括它前后的多个字符，比如上面的cat、dog、fish，可用()改变这一行为
console.log(/(at|do)g/.test('atg')) // true

4.8 量词 ?*+{m,n}，用于限定子模式出现在正则表达式的次数

// 1、匹配n次：{n}
console.log(/a{2}/.test('abc'), /a{2}/.test('aabc')) // false true
// 2、匹配至少m次，最多n次：{m,n}
console.log(/a{1,3}/.test('baaaac')) // true
// 3、匹配至少m次，最多不限：{m,}
console.log(/a{2,}/.test('abc'), /a{2,}/.test('aaaabc')) // false true
// 4、匹配0次或多次，相当于{0,}：*
console.log(/a*/.test('bc'), /a*/.test('abc')) // true true
// 5、匹配0次或1次，相当于{0,1}：?
console.log(/https?/.test('http://'), /https?/.test('https://')) // true true
// 6、匹配一次或多次，相当于{1,}：+
console.log(/a+/.test('bc'), /a+/.test('aaaabc')) // false true

4.9 贪婪与懒惰(量词后加?)

上面的量词符，默认情况下都是最大可能匹配，即匹配到下一个字符不满足匹配规则为止，这被称为贪婪模式
要开启懒惰模式，需要在量词后加?
贪婪模式量词： {x,y} {x,} ? * +

懒惰模式量词： {x,y}? {x,}? ?? *? +?

const str = 'baaaac'
console.log(str.match(/ba{1,4}/), str.match(/ba{1,4}?/)) // ['baaaa']  ['ba']
console.log(str.match(/ba{2,}/), str.match(/ba{2,}?/)) // ['baaaa']  ['baa']
console.log(str.match(/ba?/), str.match(/ba??/)) // ['ba']  ['b']
console.log(str.match(/ba*/), str.match(/ba*?/)) // ['baaaa']  ['b']
console.log(str.match(/ba+/), str.match(/ba+?/)) // ['baaaa']  ['ba']

4.10 分组()、反向引用(\n)与非捕获组(?:)

// 一、分组（子表达式）可以理解为，数学运算中的括号，用于计算的分组使用
console.log(/abc{2}/.exec('1abcc2')) // ['abcc', index: 1, input: '1abcc2']
console.log(/(\d{4})[/-](\d{2})[/-](\d{2})/.exec('2021-12-01'))
// 上面的返回值 ["2021-12-01", "2021", "12", "01", index: 0, input: "2021-12-01"]
// 1、对比上面两个返回结果可以看出，第二个带分组的，会在匹配结果里返回每个分组匹配到的结果，即'2021','12','01'
// 2、使用组匹配时，不宜同时使用g修饰符，否则match方法不会捕获分组的内容(但exec方法不受影响)
console.log('2021-12-01'.match(/(\d{4})[/-](\d{2})[/-](\d{2})/g)) // ["2021-12-01"]
console.log('2021-12-01'.match(/(\d{4})[/-](\d{2})[/-](\d{2})/)) // ["2021-12-01", "2021", "12", "01"]
console.log(/(\d{4})[/-](\d{2})[/-](\d{2})/g.exec('2021-12-01')) // ["2021-12-01", "2021", "12", "01"]
console.log(/(\d{4})[/-](\d{2})[/-](\d{2})/.exec('2021-12-01')) // ["2021-12-01", "2021", "12", "01"]
// 3、分组的用途
'2021-12-01'.replace(/(\d{4})[/-](\d{2})[/-](\d{2})/, '$1+$2+$3') // 2021+12+01
'2021-12-01'.replace(/(\d{4})[/-](\d{2})[/-](\d{2})/, ($1,$2,$3,$4) => { // 2021+12+01
  return `${$2}+${$3}+${$4}`
})
// 注：如果replace第二个参数是function，则它的第一个参数（这里的$1）是整个匹配到的结果，分组匹配结果从第二个参数（这里的$2）开始
// 4、\n：表示第n个括号匹配的内容
// 用于匹配网页标签 ["<p class="cs">hello</p>", "p", " class="cs"]
console.log(/<([^>]+)([^>]*)>[^<]*<\/\1>/.exec('<p class="cs">hello</p>'))
// \n是内容完全匹配，而不仅是格式匹配
console.log(/(.)+.*\1/.exec('hello world')) // ["hello wo", "o"]
console.log(/(.).*\1/.exec('hello world')) // ["llo worl", "l"]
// 分析：上面两者之所以分组中匹配到的不一样，是因为正则默认是"贪婪模式"，可以改为懒惰模式
console.log(/(.)+?.*\1/.exec('hello world')) // ["hello worl", "l"]
// 二、非捕获组 (?:) - 放在分组最前面
// 表示不返回该组匹配的内容，即匹配的结果中不计入这个括号里匹配到的结果
console.log(/^(?:\d{4})[/-](\d{2})[/-](\d{2})$/.exec('2021-12-01')) // ["2021-12-01", "12", "01"]
console.log(/^(\d{4})[/-](\d{2})[/-](\d{2})$/.exec('2021-12-01')) // ["2021-12-01", "2021", "12", "01"]

4.11 零宽断言

// 1、前瞻lookahead(?= 先行断言、?! 先行否定断言、负向先行断言)
'hello world'.match(/l(?=o)/) // ["l", index: 3]
// x(?=y)：表示x后面必须有y，才能匹配上；括号里的部分y不会返回
// x(?!y)：相反，表示x后面必须没有y，才能匹配上；括号里的部分y不会返回
'hello world'.replace(/l(?=o)/g, ' ') // "hel o world"
'hello world'.replace(/l(?!o)/g, ' ') // "he lo wor d"
// 2、后顾lookbehind(?<= 后行断言、?<! 后行否定断言、负向后行断言) - 慎用，有些平台不支持
// (?<=y)x：表示x前面必须有y，才能匹配上；括号里的部分y不会返回
// (?<!y)x：相反，表示x前面必须没有y，才能匹配上；括号里的部分y不会返回
console.log('hello world'.replace(/(?<=e)l/g, ' ')) // "he lo world"
console.log('hello world'.replace(/(?<!e)l/g, ' ')) // "hel o wor d"
// 3、前面讲到，前瞻是匹配x后面有没有y，那如果不写x呢？(作用类似indexOf：只匹配位置，exec返回的匹配内容是空字符串)
console.log(/(?=abc)/.exec("cbacbac"), /(?=abc)/.test("cbacbac")) // null false
console.log(/(?=abc)/.exec("cbacabc"), /(?=abc)/.test("cbacabc")) // ["", index: 4] true
// 4、那么对于否定前瞻(?!)，如果不带x，我们能做什么呢？
console.log(/(?!\d)/.exec("123"), /(?!\d)/.test("123")) // ["", index: 3] true
// 进一步改造上面这个不带x的否定前瞻正则 - 匹配不全是某种规则的字符
console.log(/^(?!\d+$)/.exec('123'), /^(?!\d+$)/.test('123')) // null false
// 使用场景，金额每三位用,分隔
'1234567'.replace(/(\d)(?=(\d{3})+$)/g, '$1,') // "1,234,567"
'1234567'.match(/(\d)(?=(\d{3})+$)/g) // ["1", "4"]

五、常用正则举例

// 1、手机号
reg = /^1[3456789]\d{9}$/
// 2、身份证(15位数字 或 17位数字+数字或x)
reg = /^[1-9]\d{14}(\d{2}[0-9xX])?$/
// 3、密码 - 数字、字母、下划线最少两种，8-16位：https://juejin.cn/post/6946016272245063716#heading-4
reg = /^(?!(\d+|[a-zA-Z]+|_+)$)\w{8,16}$/
reg = /^(?!\d+$)(?![a-zA-Z]+$)(?!_+$)\w{8,16}$/ // 等价
// 注：前瞻（?=、?!）在各主流平台都支持；兼容性较差的是后顾（?<=、?<!），iOS 16.4 以下的 Safari 不支持。如果想完全不依赖断言，也可以用方法实现
// Assertion-free password check: true when str is 8-16 word characters
// (letters, digits, underscore) and contains at least two of those three kinds.
fun = (str) => {
  const kinds = [/[A-Za-z]/, /\d/, /_/]
  let kindsPresent = 0
  for (const kind of kinds) {
    if (kind.test(str)) {
      kindsPresent += 1
    }
  }
  // The length/charset test only runs once the variety requirement is met.
  return kindsPresent >= 2 && /^\w{8,16}$/.test(str)
}
// 4、计算字符串字节数 - 中文占2字节，其余的1字节
// Byte-length estimate: every common CJK ideograph (U+4E00-U+9FA5) counts as 2,
// every other UTF-16 code unit counts as 1.
// The range has to sit inside a character class; without the brackets the
// pattern only matches the literal three-character text "\u4e00-\u9fa5".
fun = (str) => str.replace(/[\u4e00-\u9fa5]/g, 'aa').length
// 5、多个空格换成一个
// Collapse each run of whitespace characters in str into a single space.
fun = (str) => {
  const whitespaceRun = /\s+/g
  return str.replace(whitespaceRun, ' ')
}
// 6、是否包含连续相同的字符串 - 前瞻
reg = /(?=(\w+)\1)/ // /(?=(\w+)\1)/.test("cbacbabc") true
// 7、最多两位小数
reg = /^\d*\.?\d{0,2}$/
// 8、去除所有的html标签
// Remove every "<...>" span from str and keep the text between them.
// Purely textual: a "<" with no later ">" is left untouched.
fun = (str) => {
  const tagPattern = /<[^>]+>/gi
  return str.replace(tagPattern, '')
}
// 9、获取url参数
// Read one query-string parameter from window.location.search.
// @param name - parameter name; matched literally (regex metacharacters are escaped).
// @returns the raw (not URL-decoded) value of the first occurrence, or '' when absent.
fun = (name) => {
  // Drop the leading "?"; slice replaces the deprecated substr.
  const query = window.location.search.slice(1)
  // Escape metacharacters so names such as "a.b" or "list[]" cannot alter or break the pattern.
  const literalName = String(name).replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  // Run the pattern once and reuse the result instead of matching twice.
  const found = new RegExp('(^|&)' + literalName + '=([^&]*)(&|$)').exec(query)

  return found ? found[2] : ''
}
// 10、阿拉伯数字替换成对应的中文大写
// Lookup table: index n holds the Chinese capital (financial) numeral for digit n.
arr = ["零","壹","贰","叁","肆","伍","陆","柒","捌","玖"]
// Replace each digit 0-9 in str with its entry in arr; other characters pass through unchanged.
fun = (str) => str.replace(/\d/g, (digit) => arr[digit])