正则表达式与Python的re模块

一.re模块的match()函数

1.match()函数若匹配成功返回一个Match对象，Match对象的方法

1.group()或者group(0)

作用：获取匹配的字符串

python 复制代码

import re  # every snippet in this article relies on the re module

text3 = "123-7894-5201"
pattern = r'\d{3}-\d{4}-\d{4}'
# re.match() only tries the pattern at the very start of the string: it returns
# None if the beginning does not match, and a Match object otherwise.
matcher = re.match(pattern, text3)

if matcher:
    # repr(matcher) -> <re.Match object; span=(0, 13), match='123-7894-5201'>
    # type(matcher) -> <class 're.Match'>
    print(matcher.group())  # 123-7894-5201  (group() returns the matched text)
else:
    print("匹配失败!")

python 复制代码

text = "我的电话：123-4567-8989"
# Without a raw string every regex backslash has to be escaped itself: in "\\d"
# the first backslash escapes the second one, so the regex engine receives \d.
pattern = "\\d{3}-\\d{4}-\\d{4}"
# re.match() anchors at index 0, and this text does not start with digits.
matcher = re.match(pattern,text)
print(matcher) # None

二.re模块的search()函数

作用：扫描整个字符串，在字符串的任意一个位置查找第一个匹配项。只要字符串中存在模式就返回匹配对象

python 复制代码

text = "我的电话：123-4567-8989"
pattern = "[0-9]{3}-[0-9]{4}-[0-9]{4}"
# re.search() scans the whole string and returns the first match, or None.
matcher = re.search(pattern, text)
if matcher is None:
    print("匹配失败！")
else:
    print(type(matcher))  # <class 're.Match'>
    print(matcher.group())  # 123-4567-8989

三.re模块中的findall()函数

1.findall()函数的作用

用于在字符串中查找所有匹配正则表达式的子串，并返回一个列表；如果模式中有分组，则返回分组匹配的列表；如果有多个分组，则返回元组形式的列表

2.findall()函数的语法

re.findall(pattern,string,flags=0)

参数说明：

pattern：正则表达式模式
string：要匹配的字符串
flags：可选标志，如re.IGNORECASE等

python 复制代码

text = "苹果5元，香蕉6元，橘子3元"
pattern = r"\d+"
# The subject string is passed by keyword here (string=...).
result = re.findall(pattern,string=text)
print(result) # ['5', '6', '3']

python 复制代码

text = "有两个数字：123和4567"
pattern = r"\d+"
# With no groups in the pattern, findall() returns the matched substrings.
res = re.findall(pattern,text) 
print(res) # ['123', '4567']
print(type(res)) # <class 'list'>

python 复制代码

text = "苹果5元，香蕉6元，橘子3元"
# One or more digits immediately followed by the literal character 元.
pattern = r"\d+元"
res = re.findall(pattern,text)
print(res) # ['5元', '6元', '3元']

注意：\w匹配单个“单词字符”。对于str（Unicode）模式，\w除了下划线、大小写英文字母、数字0到9之外，还会匹配其他语言的文字（比如中文、日文）；只有在使用re.ASCII标志（或使用bytes模式）时，\w才只匹配下划线、大小写英文字母、数字0到9

python 复制代码

# Find every word in the sentence.
text = "Hello World! This is Python,I love it very much."
# \w matches one word character; for this ASCII-only text that means
# underscore, upper/lower-case English letters and the digits 0-9.
pattern = r"\w+"

res = re.findall(pattern,text)
print(res) # ['Hello', 'World', 'This', 'is', 'Python', 'I', 'love', 'it', 'very', 'much']

python 复制代码

# How groups change what findall() returns.
text = "姓名：张三，年龄：25，姓名：李四，年龄：30"
# No group in the pattern: each result is the whole matched substring.
matches = re.findall(r"姓名：\w+，年龄：\d+",text)
print(matches) # ['姓名：张三，年龄：25', '姓名：李四，年龄：30']

python 复制代码

# Exactly one capture group: each result is that group's text only.
text = "姓名：张三，年龄：25，姓名：李四，年龄：30"
matches = re.findall(r"姓名：(\w+)",text) 
print(matches) # ['张三', '李四']

python 复制代码

# Several capture groups: each result is a tuple with one entry per group.
text = "姓名：张三，年龄：25，姓名：李四，年龄：30"
matches = re.findall(r"姓名：(\w+)，年龄：(\d+)",text)
print(matches) # [('张三', '25'), ('李四', '30')]

python 复制代码

text = "Apple, banana, APPLE, BANANA"
# re.IGNORECASE makes the match case-insensitive.
fruits = re.findall("apple",text,re.IGNORECASE)
print(fruits) # ['Apple', 'APPLE']

python 复制代码

# Multi-line mode.
text = "第一行\n第二行\n第三行"
# re.MULTILINE lets ^ match at the start of every line; the metacharacter .
# matches any single character except a newline.
lines = re.findall("^第.行",text,re.MULTILINE)
print(lines) # ['第一行', '第二行', '第三行']

三.正则表达式元字符之一：括号()->分组，匹配子组

在正则表达式中，()表示一个捕获分组

有两个主要作用：

分组：将多个模式元素组合在一起，作为一个整体处理
保存匹配的文本，以便后续引用或提取

基本用法：

python 复制代码

# Applying a quantifier to a group.
text = "ababab abc"
# "ab" is grouped and the quantifier {3} applies to the whole group.
matches = re.findall(r"(ab){3}",text)
print(matches) # ['ab']  findall() returns the group's text (its last repetition), not the whole match

# A non-capturing group (?:...) keeps the grouping but returns the whole match.
matches = re.findall(r"(?:ab){3}",text)
print(matches) # ['ababab']

python 复制代码

# Extracting only part of each match.
text = "日期：2025-10-22 时间：22:37:30"
# The labels are matched but only the parenthesised date / time is returned.
dates = re.findall(r"日期：(\d{4}-\d{2}-\d{2})",text)
times = re.findall(r"时间：(\d{2}:\d{2}:\d{2})",text)
print(dates) # ['2025-10-22']
print(times) # ['22:37:30']

python 复制代码

# Reading capture groups through a Match object.
text = "姓名：张三，年龄：25"
pattern = r"姓名：(\w+)，年龄：(\d+)"
matches = re.search(pattern,text)
# repr(matches) -> <re.Match object; span=(0, 11), match='姓名：张三，年龄：25'>
# span is a (start, end) tuple of indexes into the original string: start is
# inclusive, end is exclusive.
# type(matches) -> <class 're.Match'>

print("完整匹配：",matches.group()) # 完整匹配： 姓名：张三，年龄：25
print("另一种方式获取整个匹配好的子串：",matches.group(0)) # 另一种方式获取整个匹配好的子串： 姓名：张三，年龄：25
print(matches.groups()) # ('张三', '25')  tuple of the captured texts; it can be unpacked
print(matches.group()) # 姓名：张三，年龄：25
print(type(matches.groups())) # <class 'tuple'>
print(type(matches.group())) # <class 'str'>

print(f"匹配范围: {matches.span()}") # 匹配范围: (0, 11)
print(f"匹配开始的位置: {matches.start()}") # 匹配开始的位置: 0
print(f"匹配结束的位置: {matches.end()}") # 匹配结束的位置: 11

# Slicing the text with the span reproduces the matched substring.
start, end = matches.span()
print(text[start:end]) # 姓名：张三，年龄：25

python 复制代码

# How findall() behaves when the pattern contains capture groups.
text = "admin@example.com, sales@company.org"
# One capture group: only the captured suffix is returned.
emails = re.findall(r"\w+@\w+\.(com|org)",text)
print(emails) # ['com', 'org']

# Three capture groups: each result is a 3-tuple.
emails = re.findall(r"(\w+)@(\w+)\.(com|org)",text)
print(emails) # [('admin', 'example', 'com'), ('sales', 'company', 'org')]

1.与 `(?:)` 非捕获分组的对比

特性	捕获分组 `()`	非捕获分组 `(?:)`
是否保存匹配文本	是	否
是否占用分组编号	是	否
在 `findall()` 中的行为	返回分组内容	不影响返回结果
性能	稍慢（需要存储）	稍快

python 复制代码

# With a non-capturing group the whole match is returned.
text = "admin@example.com, sales@company.org"
emails = re.findall(r"\w+@\w+\.(?:com|org)",text)
print(emails) # ['admin@example.com', 'sales@company.org']

实际应用示例：

1.提取URL各部分

python 复制代码

# Split a URL into protocol, domain and path.
url = "https://www.example.com/path/to/page"
# The dots between the host labels are escaped so that they match a literal "."
# only; an unescaped . would accept any character in those positions.
pattern = r"(https?)://(\w+\.\w+\.\w+)(/.+)"
matches = re.findall(pattern, url)
print(matches)  # [('https', 'www.example.com', '/path/to/page')]
matcher = re.search(pattern, url)
if matcher:
    # groups() yields the three captures in pattern order.
    protocol, domain, path = matcher.groups()
    print(f"协议: {protocol}, 域名: {domain}, 路径: {path}")  # 协议: https, 域名: www.example.com, 路径: /path/to/page

2.数据格式验证和提取

python 复制代码

import re

text = "订单号: ORD-2023-12345, 金额: $150.75；订单号：ABC-2025-10989，金额：￥2025.35；订单号：VDE-20000-14576，金额：$51.10"

# Group 1 is the order id (three capitals, four digits, five digits) and
# group 2 is the amount ($ or ￥ followed by a number with two decimals).
# Both ASCII and full-width colons/commas are accepted, and \s* absorbs any
# whitespace that follows them.
pattern = r"订单号[:：]\s*([A-Z]{3}-\d{4}-\d{5})[,，]\s*金额[:：]\s*([$￥]\d+\.\d{2})"

# re.search() would stop at the first order; findall() returns every
# (order id, amount) pair found in the text.
matches = re.findall(pattern, text)
print(matches)  # [('ORD-2023-12345', '$150.75'), ('ABC-2025-10989', '￥2025.35')]
for pair in matches:
    order_id, amount = pair
    print(f"订单号：{order_id}, 金额：{amount}")
# Printed lines:
# 订单号：ORD-2023-12345, 金额：$150.75
# 订单号：ABC-2025-10989, 金额：￥2025.35

四.元字符：字符类\[\]

\[\]表示一个字符类，用于匹配方括号中指定的任意一个字符

基本用法：

1.匹配指定字符中的任意一个

python 复制代码

text = "apple banana cherry"
# Match any single character that is a, b or c.
pattern = r"[abc]"
matches = re.findall(pattern,text)
print(matches) # ['a', 'b', 'a', 'a', 'a', 'c']

2.使用连字符"-"表示范围

python 复制代码

text = "Hello World 123"
# Any lower-case ASCII letter.
pattern1 = r"[a-z]"
matches1 = re.findall(pattern1,text)
print(matches1) # ['e', 'l', 'l', 'o', 'o', 'r', 'l', 'd']
# Any digit.
pattern2 = r"[0-9]"
matches2 = re.findall(pattern2,text)
print(matches2) # ['1', '2', '3']
# Any upper-case ASCII letter.
pattern3 = r"[A-Z]"
matches3 = re.findall(pattern3,text)
print(matches3) # ['H', 'W']

3.使用"-"组合多个范围

python 复制代码

text = "Hello 123 World!"
# Any ASCII letter (either case) or digit: several ranges in one class.
pattern = r"[a-zA-Z0-9]"
matches = re.findall(pattern,text)
print(matches) # ['H', 'e', 'l', 'l', 'o', '1', '2', '3', 'W', 'o', 'r', 'l', 'd']

4.特殊用法：否定字符类（这时^必须紧跟在[后面）

python 复制代码

text = "abc123!@#"
# Match every character that is neither an ASCII letter nor a digit.
# The upper-case range must be A-Z: A-z would also cover [ \ ] ^ _ and `,
# which sit between Z and a in ASCII, and wrongly exclude them from the result.
pattern = r"[^a-zA-Z0-9]"
matches = re.findall(pattern, text)
print(matches)  # ['!', '@', '#']

5.匹配特殊字符，大多数字符在字符类中都不用转义，只有-和\需要转义（-表示范围，\表示转义符号）；另外，]以及位于开头的^若要按字面匹配也需要转义

python 复制代码

# A string literal, raw or not, cannot end with a single backslash
# (r"a+b*c-d/e\" is a syntax error), so the trailing backslash is written "\\".
text = "a+b*c-d/e\\"
print(text) # a+b*c-d/e\
# Inside the class, - and \ are escaped. An unescaped backslash placed directly
# before ] would escape the bracket and leave the character class unterminated.
pattern = r'[+*\-/\\]'
matches = re.findall(pattern,text)
print(matches) # ['+', '*', '-', '/', '\\']

五.元字符：^

1.在字符类外部：表示匹配字符串开头

python 复制代码

text = "hello world"
# Outside a character class, ^ anchors the match to the start of the string.
pattern = r"^hello"
matches = re.findall(pattern,text)
print(matches) # ['hello']
pattern2 = r"^world"
matches2 = re.findall(pattern2,text)
print(matches2) # []  no match: text does not start with "world"

2.出现字符类内部：否定字符类

当^出现在字符类内部且是字符类内部的第一个字符时，它表示否定字符类，即匹配不包含在方括号内的任何字符

python 复制代码

text = "abc/def-ghi"
# One or more characters that are not a slash.
pattern = r"[^/]+"
matches = re.findall(pattern,text)
print(matches) # ['abc', 'def-ghi']

正则表达式与Python的re模块

一.re模块的match()函数

1.match()函数若匹配成功返回一个Match对象，Match对象的方法

1.group()或者group(0)

二.re模块的search()函数

三.re模块中的findall()函数

1.findall()函数的作用

2.findall()函数的语法

三.正则表达式元字符之一：括号()->分组，匹配子组

1.与 (?:) 非捕获分组的对比

四.元字符：字符类\[\]

1.匹配指定字符中的任意一个

2.使用连字符"-"表示范围

3.使用"-"组合多个范围

4.特殊用法：否定字符类（这时^必须紧跟在[后面）

5.匹配特殊字符，大多数字符在字符类中都不用转义，只有-和\需要转义（-表示范围，\表示转义符号）；另外，]以及位于开头的^若要按字面匹配也需要转义

五.元字符：^

1.在字符类外部：表示匹配字符串开头

2.出现字符类内部：否定字符类

1.与 `(?:)` 非捕获分组的对比